From 045774490c10afe2dc9a479f176a88ddce5a9728 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 29 Oct 2024 14:15:00 +0100 Subject: [PATCH 1/7] CrateDB: Vector Store --- docs/docs/how_to/indexing.ipynb | 2 +- libs/community/extended_testing_deps.txt | 2 + .../vectorstores/__init__.py | 5 + .../vectorstores/cratedb/__init__.py | 7 + .../vectorstores/cratedb/base.py | 453 ++++++++++++ .../vectorstores/cratedb/extended.py | 91 +++ .../vectorstores/cratedb/model.py | 115 +++ .../vectorstores/docker-compose/cratedb.yml | 20 + .../vectorstores/test_cratedb.py | 668 ++++++++++++++++++ .../unit_tests/vectorstores/test_imports.py | 1 + .../vectorstores/test_indexing_docs.py | 3 +- 11 files changed, 1365 insertions(+), 2 deletions(-) create mode 100644 libs/community/langchain_community/vectorstores/cratedb/__init__.py create mode 100644 libs/community/langchain_community/vectorstores/cratedb/base.py create mode 100644 libs/community/langchain_community/vectorstores/cratedb/extended.py create mode 100644 libs/community/langchain_community/vectorstores/cratedb/model.py create mode 100644 libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml create mode 100644 libs/community/tests/integration_tests/vectorstores/test_cratedb.py diff --git a/docs/docs/how_to/indexing.ipynb b/docs/docs/how_to/indexing.ipynb index e3e6ec8aef6d7..cdce7015614ce 100644 --- a/docs/docs/how_to/indexing.ipynb +++ b/docs/docs/how_to/indexing.ipynb @@ -60,7 +60,7 @@ " * document addition by id (`add_documents` method with `ids` argument)\n", " * delete by id (`delete` method with `ids` argument)\n", "\n", - "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, 
`OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", + "Compatible Vectorstores: `Aerospike`, `AnalyticDB`, `AstraDB`, `AwaDB`, `AzureCosmosDBNoSqlVectorSearch`, `AzureCosmosDBVectorSearch`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `CrateDBVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `Milvus`, `MongoDBAtlasVectorSearch`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SingleStoreDB`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `Yellowbrick`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n", " \n", "## Caution\n", "\n", diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index d331fb66e85dd..f65e0de40507f 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -14,6 +14,7 @@ chardet>=5.1.0,<6 cloudpathlib>=0.18,<0.19 cloudpickle>=2.0.0 cohere>=4,<6 +crate==1.0.0.dev1 databricks-vectorsearch>=0.21,<0.22 datasets>=2.15.0,<3 dgml-utils>=0.3.0,<0.4 @@ -77,6 +78,7 @@ requests-toolbelt>=1.0.0,<2 rspace_client>=2.5.0,<3 scikit-learn>=1.2.2,<2 simsimd>=5.0.0,<6 +sqlalchemy-cratedb>=0.40.0,<1 sqlite-vss>=0.1.2,<0.2 sqlite-vec>=0.1.0,<0.2 sseclient-py>=1.8.0,<2 diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index c38beea0ed6d2..5741fd7a644b7 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -92,6 +92,9 @@ from 
langchain_community.vectorstores.couchbase import ( CouchbaseVectorStore, ) + from langchain_community.vectorstores.cratedb import ( + CrateDBVectorSearch, + ) from langchain_community.vectorstores.dashvector import ( DashVector, ) @@ -334,6 +337,7 @@ "Clickhouse", "ClickhouseSettings", "CouchbaseVectorStore", + "CrateDBVectorSearch", "DashVector", "DatabricksVectorSearch", "DeepLake", @@ -438,6 +442,7 @@ "Clickhouse": "langchain_community.vectorstores.clickhouse", "ClickhouseSettings": "langchain_community.vectorstores.clickhouse", "CouchbaseVectorStore": "langchain_community.vectorstores.couchbase", + "CrateDBVectorSearch": "langchain_community.vectorstores.cratedb", "DashVector": "langchain_community.vectorstores.dashvector", "DatabricksVectorSearch": "langchain_community.vectorstores.databricks_vector_search", # noqa: E501 "DeepLake": "langchain_community.vectorstores.deeplake", diff --git a/libs/community/langchain_community/vectorstores/cratedb/__init__.py b/libs/community/langchain_community/vectorstores/cratedb/__init__.py new file mode 100644 index 0000000000000..62462bce1eba9 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/__init__.py @@ -0,0 +1,7 @@ +from .base import CrateDBVectorSearch +from .extended import CrateDBVectorSearchMultiCollection + +__all__ = [ + "CrateDBVectorSearch", + "CrateDBVectorSearchMultiCollection", +] diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py new file mode 100644 index 0000000000000..ff392923cba44 --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -0,0 +1,453 @@ +from __future__ import annotations + +import enum +import math +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, +) + +import sqlalchemy +from langchain.docstore.document import Document +from langchain.schema.embeddings import Embeddings +from langchain.utils 
import get_from_dict_or_env +from langchain.vectorstores.pgvector import PGVector +from sqlalchemy.orm import sessionmaker + +from langchain_community.vectorstores.cratedb.model import ModelFactory + + +class DistanceStrategy(str, enum.Enum): + """Enumerator of the Distance strategies.""" + + EUCLIDEAN = "euclidean" + COSINE = "cosine" + MAX_INNER_PRODUCT = "inner" + + +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.EUCLIDEAN + + +_LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain" + + +def _results_to_docs(docs_and_scores: Any) -> List[Document]: + """Return docs from docs and scores.""" + return [doc for doc, _ in docs_and_scores] + + +class CrateDBVectorSearch(PGVector): + """`CrateDB` vector store. + + To use it, you should have the ``crate[sqlalchemy]`` python package installed. + + Args: + connection_string: Database connection string. + embedding_function: Any embedding function implementing + `langchain.embeddings.base.Embeddings` interface. + collection_name: The name of the collection to use. (default: langchain) + NOTE: This is not the name of the table, but the name of the collection. + The tables will be created when initializing the store (if not exists) + So, make sure the user has the right permissions to create tables. + distance_strategy: The distance strategy to use. (default: EUCLIDEAN) + pre_delete_collection: If True, will delete the collection if it exists. + (default: False). Useful for testing. + + Example: + .. code-block:: python + + from langchain.vectorstores import CrateDBVectorSearch + from langchain.embeddings.openai import OpenAIEmbeddings + + CONNECTION_STRING = "crate://crate@localhost:4200/test3" + COLLECTION_NAME = "state_of_the_union_test" + embeddings = OpenAIEmbeddings() + vectorestore = CrateDBVectorSearch.from_documents( + embedding=embeddings, + documents=docs, + collection_name=COLLECTION_NAME, + connection_string=CONNECTION_STRING, + ) + + + """ + + def __post_init__( + self, + ) -> None: + """ + Initialize the store. 
+ """ + + self._engine = self._bind + self.Session = sessionmaker(bind=self._engine) # type: ignore[call-overload] + + # Patch dialect to invoke `REFRESH TABLE` after each DML operation. + from sqlalchemy_cratedb.support import refresh_after_dml + + refresh_after_dml(self._engine) + + # Need to defer initialization, because dimension size + # can only be figured out at runtime. + self.BaseModel = None + self.CollectionStore = None # type: ignore[assignment] + self.EmbeddingStore = None # type: ignore[assignment] + + def __del__(self) -> None: + """ + Work around premature session close. + + sqlalchemy.orm.exc.DetachedInstanceError: Parent instance is not bound + to a Session; lazy load operation of attribute 'embeddings' cannot proceed. + -- https://docs.sqlalchemy.org/en/20/errors.html#error-bhk3 + + TODO: Review! + """ # noqa: E501 + pass + + def _init_models(self, embedding: List[float]) -> None: + """ + Create SQLAlchemy models at runtime, when not established yet. + """ + + # TODO: Use a better way to run this only once. + if self.CollectionStore is not None and self.EmbeddingStore is not None: + return + + size = len(embedding) + self._init_models_with_dimensionality(size=size) + + def _init_models_with_dimensionality(self, size: int) -> None: + mf = ModelFactory(dimensions=size) + self.BaseModel, self.CollectionStore, self.EmbeddingStore = ( + mf.BaseModel, # type: ignore[assignment] + mf.CollectionStore, + mf.EmbeddingStore, + ) + + def get_collection(self, session: sqlalchemy.orm.Session) -> Any: + if self.CollectionStore is None: + raise RuntimeError( + "Collection can't be accessed without specifying " + "dimension size of embedding vectors" + ) + return self.CollectionStore.get_by_name(session, self.collection_name) + + def add_embeddings( + self, + texts: Iterable[str], + embeddings: List[List[float]], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Add embeddings to the vectorstore. 
+ + Args: + texts: Iterable of strings to add to the vectorstore. + embeddings: List of list of embedding vectors. + metadatas: List of metadatas associated with the texts. + kwargs: vectorstore specific parameters + """ + from sqlalchemy_cratedb.support import refresh_table + + if not embeddings: + return [] + self._init_models(embeddings[0]) + + # When the user requested to delete the collection before running subsequent + # operations on it, run the deletion gracefully if the table does not exist + # yet. + if self.pre_delete_collection: + try: + self.delete_collection() + except sqlalchemy.exc.ProgrammingError as ex: + if "RelationUnknown" not in str(ex): + raise + + # Tables need to be created at runtime, because the `EmbeddingStore.embedding` + # field, a `FloatVector`, needs to be initialized with a dimensionality + # parameter, which is only obtained at runtime. + self.create_tables_if_not_exists() + self.create_collection() + + # After setting up the table/collection at runtime, add embeddings. + embedding_ids = super().add_embeddings( + texts=texts, embeddings=embeddings, metadatas=metadatas, ids=ids, **kwargs + ) + refresh_table(self.Session(), self.EmbeddingStore) + return embedding_ids + + def create_tables_if_not_exists(self) -> None: + """ + Need to overwrite because this `Base` is different from parent's `Base`. + """ + if self.BaseModel is None: + raise RuntimeError("Storage models not initialized") + self.BaseModel.metadata.create_all(self._engine) + + def drop_tables(self) -> None: + """ + Need to overwrite because this `Base` is different from parent's `Base`. + """ + mf = ModelFactory() + mf.Base.metadata.drop_all(self._engine) + + def delete( + self, + ids: Optional[List[str]] = None, + collection_only: bool = False, + **kwargs: Any, + ) -> None: + """ + Delete vectors by ids or uuids. + + Remark: Specialized for CrateDB to synchronize data. + + Args: + ids: List of ids to delete. 
+ + Remark: Patch for CrateDB needs to overwrite this, in order to + add a "REFRESH TABLE" statement afterwards. The other + patch, listening to `after_delete` events seems not be + able to catch it. + """ + from sqlalchemy_cratedb.support import refresh_table + + super().delete(ids=ids, collection_only=collection_only, **kwargs) + + # CrateDB: Synchronize data because `on_flush` does not catch it. + with self.Session() as session: + refresh_table(session, self.EmbeddingStore) + + @property + def distance_strategy(self) -> Any: + if self._distance_strategy == DistanceStrategy.EUCLIDEAN: + return self.EmbeddingStore.embedding.euclidean_distance + elif self._distance_strategy == DistanceStrategy.COSINE: + raise NotImplementedError("Cosine similarity not implemented yet") + elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + raise NotImplementedError("Dot-product similarity not implemented yet") + else: + raise ValueError( + f"Got unexpected value for distance: {self._distance_strategy}. " + f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}." 
+ ) + + def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, float]]: + """Return docs and scores from results.""" + docs = [ + ( + Document( + page_content=result.EmbeddingStore.document, + metadata=result.EmbeddingStore.cmetadata, + ), + result._score if self.embedding_function is not None else None, + ) + for result in results + ] + return docs + + def _query_collection( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, str]] = None, + ) -> List[Any]: + """Query the collection.""" + self._init_models(embedding) + with self.Session() as session: + collection = self.get_collection(session) + if not collection: + raise ValueError("Collection not found") + return self._query_collection_multi( + collections=[collection], embedding=embedding, k=k, filter=filter + ) + + def _query_collection_multi( + self, + collections: List[Any], + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, str]] = None, + ) -> List[Any]: + """Query the collection.""" + self._init_models(embedding) + + collection_names = [coll.name for coll in collections] + collection_uuids = [coll.uuid for coll in collections] + self.logger.info(f"Querying collections: {collection_names}") + + with self.Session() as session: + filter_by = self.EmbeddingStore.collection_id.in_(collection_uuids) + + if filter is not None: + filter_clauses = [] + for key, value in filter.items(): + IN = "in" + if isinstance(value, dict) and IN in map(str.lower, value): + value_case_insensitive = { + k.lower(): v for k, v in value.items() + } + filter_by_metadata = self.EmbeddingStore.cmetadata[key].in_( + value_case_insensitive[IN] + ) + filter_clauses.append(filter_by_metadata) + else: + filter_by_metadata = self.EmbeddingStore.cmetadata[key] == str( + value + ) # type: ignore[assignment] + filter_clauses.append(filter_by_metadata) + + filter_by = sqlalchemy.and_(filter_by, *filter_clauses) # type: ignore[assignment] + + _type = self.EmbeddingStore + + results: 
List[Any] = ( + session.query( # type: ignore[attr-defined] + self.EmbeddingStore, + # TODO: Original pgvector code uses `self.distance_strategy`. + # CrateDB currently only supports EUCLIDEAN. + # self.distance_strategy(embedding).label("distance") + sqlalchemy.literal_column( + f"{self.EmbeddingStore.__tablename__}._score" + ).label("_score"), + ) + .filter(filter_by) + # CrateDB applies `KNN_MATCH` within the `WHERE` clause. + .filter( + sqlalchemy.func.knn_match( + self.EmbeddingStore.embedding, embedding, k + ) + ) + .order_by(sqlalchemy.desc("_score")) + .join( + self.CollectionStore, + self.EmbeddingStore.collection_id == self.CollectionStore.uuid, + ) + .limit(k) + ) + return results + + @classmethod + def from_texts( # type: ignore[override] + cls: Type[CrateDBVectorSearch], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + ids: Optional[List[str]] = None, + pre_delete_collection: bool = False, + **kwargs: Any, + ) -> CrateDBVectorSearch: + """ + Return VectorStore initialized from texts and embeddings. + Database connection string is required. + + Either pass it as a parameter, or set the CRATEDB_CONNECTION_STRING + environment variable. + + Remark: Needs to be overwritten, because CrateDB uses a different + DEFAULT_DISTANCE_STRATEGY. 
+ """ + return super().from_texts( # type: ignore[return-value] + texts, + embedding, + metadatas=metadatas, + ids=ids, + collection_name=collection_name, + distance_strategy=distance_strategy, # type: ignore[arg-type] + pre_delete_collection=pre_delete_collection, + **kwargs, + ) + + @classmethod + def get_connection_string(cls, kwargs: Dict[str, Any]) -> str: + connection_string: str = get_from_dict_or_env( + data=kwargs, + key="connection_string", + env_key="CRATEDB_CONNECTION_STRING", + ) + + if not connection_string: + raise ValueError( + "Database connection string is required." + "Either pass it as a parameter, or set the " + "CRATEDB_CONNECTION_STRING environment variable." + ) + + return connection_string + + @classmethod + def connection_string_from_db_params( + cls, + driver: str, + host: str, + port: int, + database: str, + user: str, + password: str, + ) -> str: + """Return connection string from database parameters.""" + return str( + sqlalchemy.URL.create( + drivername=driver, + host=host, + port=port, + username=user, + password=password, + query={"schema": database}, + ) + ) + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. 
+ """ + if self.override_relevance_score_fn is not None: + return self.override_relevance_score_fn + + # Default strategy is to rely on distance strategy provided + # in vectorstore constructor + if self._distance_strategy == DistanceStrategy.COSINE: + return self._cosine_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.EUCLIDEAN: + return self._euclidean_relevance_score_fn + elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + else: + raise ValueError( + "No supported normalization function for distance_strategy of " + f"{self._distance_strategy}. Consider providing relevance_score_fn to " + "CrateDBVectorSearch constructor." + ) + + @staticmethod + def _euclidean_relevance_score_fn(score: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many + # others are not!) + # - embedding dimensionality + # - etc. 
+ # This function converts the euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + + # Original: + # return 1.0 - distance / math.sqrt(2) + return score / math.sqrt(2) diff --git a/libs/community/langchain_community/vectorstores/cratedb/extended.py b/libs/community/langchain_community/vectorstores/cratedb/extended.py new file mode 100644 index 0000000000000..4d5ae8e88f3fd --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/extended.py @@ -0,0 +1,91 @@ +import logging +from typing import ( + Any, + Callable, + Dict, + List, + Optional, +) + +import sqlalchemy +from langchain.schema.embeddings import Embeddings + +from langchain_community.vectorstores.cratedb.base import ( + DEFAULT_DISTANCE_STRATEGY, + CrateDBVectorSearch, + DistanceStrategy, +) +from langchain_community.vectorstores.pgvector import _LANGCHAIN_DEFAULT_COLLECTION_NAME + + +class CrateDBVectorSearchMultiCollection(CrateDBVectorSearch): + """ + Provide functionality for searching multiple collections. + It can not be used for indexing documents. + + To use it, you should have the ``sqlalchemy-cratedb`` Python package installed. 
+ + Synopsis:: + + from langchain.vectorstores.cratedb import CrateDBVectorSearchMultiCollection + + multisearch = CrateDBVectorSearchMultiCollection( + collection_names=["collection_foo", "collection_bar"], + embedding_function=embeddings, + connection_string=CONNECTION_STRING, + ) + docs_with_score = multisearch.similarity_search_with_score(query) + """ + + def __init__( + self, + connection_string: str, + embedding_function: Embeddings, + collection_names: List[str] = [_LANGCHAIN_DEFAULT_COLLECTION_NAME], + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, # type: ignore[arg-type] + logger: Optional[logging.Logger] = None, + relevance_score_fn: Optional[Callable[[float], float]] = None, + *, + connection: Optional[sqlalchemy.engine.Connection] = None, + engine_args: Optional[dict[str, Any]] = None, + ) -> None: + self.connection_string = connection_string + self.embedding_function = embedding_function + self.collection_names = collection_names + self._distance_strategy = distance_strategy # type: ignore[assignment] + self.logger = logger or logging.getLogger(__name__) + self.override_relevance_score_fn = relevance_score_fn + self.engine_args = engine_args or {} + + # Create a connection if not provided, otherwise use the provided connection + self._bind = connection if connection else self._create_engine() + + self.__post_init__() + + @classmethod + def _from(cls, *args: List, **kwargs: Dict): # type: ignore[no-untyped-def,override] + raise NotImplementedError("This adapter can not be used for indexing documents") + + def get_collections(self, session: sqlalchemy.orm.Session) -> Any: + if self.CollectionStore is None: + raise RuntimeError( + "Collection can't be accessed without specifying " + "dimension size of embedding vectors" + ) + return self.CollectionStore.get_by_names(session, self.collection_names) + + def _query_collection( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, str]] = None, + ) -> List[Any]: + 
"""Query multiple collections.""" + self._init_models(embedding) + with self.Session() as session: + collections = self.get_collections(session) + if not collections: + raise ValueError("No collections found") + return self._query_collection_multi( + collections=collections, embedding=embedding, k=k, filter=filter + ) diff --git a/libs/community/langchain_community/vectorstores/cratedb/model.py b/libs/community/langchain_community/vectorstores/cratedb/model.py new file mode 100644 index 0000000000000..68848fa186a1c --- /dev/null +++ b/libs/community/langchain_community/vectorstores/cratedb/model.py @@ -0,0 +1,115 @@ +import uuid +from typing import Any, List, Optional, Tuple + +import sqlalchemy +from sqlalchemy.orm import Session, declarative_base, relationship + + +def generate_uuid() -> str: + return str(uuid.uuid4()) + + +class ModelFactory: + """Provide SQLAlchemy model objects at runtime.""" + + def __init__(self, dimensions: Optional[int] = None): + from sqlalchemy_cratedb import FloatVector, ObjectType + + # While it does not have any function here, you will still need to supply a + # dummy dimension size value for operations like deleting records. + self.dimensions = dimensions or 1024 + + Base: Any = declarative_base() + + # Optional: Use a custom schema for the langchain tables. 
+ # Base = declarative_base(metadata=MetaData(schema="langchain")) # type: Any + + class BaseModel(Base): + """Base model for the SQL stores.""" + + __abstract__ = True + uuid = sqlalchemy.Column( + sqlalchemy.String, primary_key=True, default=generate_uuid + ) + + class CollectionStore(BaseModel): + """Collection store.""" + + __tablename__ = "collection" + __table_args__ = {"keep_existing": True} + + name = sqlalchemy.Column(sqlalchemy.String) + cmetadata: sqlalchemy.Column = sqlalchemy.Column(ObjectType) + + embeddings = relationship( + "EmbeddingStore", + back_populates="collection", + cascade="all, delete-orphan", + passive_deletes=False, + ) + + @classmethod + def get_by_name( + cls, session: Session, name: str + ) -> Optional["CollectionStore"]: + return session.query(cls).filter(cls.name == name).first() # type: ignore[attr-defined] + + @classmethod + def get_by_names( + cls, session: Session, names: List[str] + ) -> List["CollectionStore"]: + return session.query(cls).filter(cls.name.in_(names)).all() # type: ignore[attr-defined] + + @classmethod + def get_or_create( + cls, + session: Session, + name: str, + cmetadata: Optional[dict] = None, + ) -> Tuple["CollectionStore", bool]: + """ + Get or create a collection. + Returns [Collection, bool] where the bool is True + if the collection was created. 
+ """ + created = False + collection = cls.get_by_name(session, name) + if collection: + return collection, created + + collection = cls(name=name, cmetadata=cmetadata) + session.add(collection) + session.commit() + created = True + return collection, created + + class EmbeddingStore(BaseModel): + """Embedding store.""" + + __tablename__ = "embedding" + __table_args__ = {"keep_existing": True} + + collection_id = sqlalchemy.Column( + sqlalchemy.String, + sqlalchemy.ForeignKey( + f"{CollectionStore.__tablename__}.uuid", + ondelete="CASCADE", + ), + ) + collection = relationship("CollectionStore", back_populates="embeddings") + + embedding: sqlalchemy.Column = sqlalchemy.Column( + FloatVector(self.dimensions) + ) + document: sqlalchemy.Column = sqlalchemy.Column( + sqlalchemy.String, nullable=True + ) + cmetadata: sqlalchemy.Column = sqlalchemy.Column(ObjectType, nullable=True) + + # custom_id : any user defined id + custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) + + self.Base = Base + self.BaseModel = BaseModel + self.CollectionStore = CollectionStore + self.EmbeddingStore = EmbeddingStore diff --git a/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml b/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml new file mode 100644 index 0000000000000..b547b2c766f20 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/docker-compose/cratedb.yml @@ -0,0 +1,20 @@ +version: "3" + +services: + postgresql: + image: crate/crate:nightly + environment: + - CRATE_HEAP_SIZE=4g + ports: + - "4200:4200" + - "5432:5432" + command: | + crate -Cdiscovery.type=single-node + healthcheck: + test: + [ + "CMD-SHELL", + "curl --silent --fail http://localhost:4200/ || exit 1", + ] + interval: 5s + retries: 60 diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py new file mode 100644 index 
0000000000000..e258a42177f70 --- /dev/null +++ b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py @@ -0,0 +1,668 @@ +""" +Test CrateDB `FLOAT_VECTOR` / `KNN_MATCH` functionality. + +cd tests/integration_tests/vectorstores/docker-compose +docker-compose -f cratedb.yml up +""" + +import os +import re +from typing import Dict, Generator, List + +import pytest +import sqlalchemy as sa +import sqlalchemy.orm +from langchain.docstore.document import Document +from sqlalchemy.exc import ProgrammingError +from sqlalchemy.orm import Session + +from langchain_community.vectorstores.cratedb import CrateDBVectorSearch +from langchain_community.vectorstores.cratedb.extended import ( + CrateDBVectorSearchMultiCollection, +) +from langchain_community.vectorstores.cratedb.model import ModelFactory +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, + FakeEmbeddings, +) + +SCHEMA_NAME = os.environ.get("TEST_CRATEDB_DATABASE", "testdrive") + +CONNECTION_STRING = CrateDBVectorSearch.connection_string_from_db_params( + driver=os.environ.get("TEST_CRATEDB_DRIVER", "crate"), + host=os.environ.get("TEST_CRATEDB_HOST", "localhost"), + port=int(os.environ.get("TEST_CRATEDB_PORT", "4200")), + database=SCHEMA_NAME, + user=os.environ.get("TEST_CRATEDB_USER", "crate"), + password=os.environ.get("TEST_CRATEDB_PASSWORD", ""), +) + +ADA_TOKEN_COUNT = 1536 + + +@pytest.fixture +def engine() -> sa.Engine: + """ + Return an SQLAlchemy engine object. + """ + return sa.create_engine(CONNECTION_STRING, echo=False) + + +@pytest.fixture +def session(engine: sa.Engine) -> Generator[sa.orm.Session, None, None]: + with engine.connect() as conn: + with Session(conn) as session: + yield session + + +@pytest.fixture(autouse=True) +def drop_tables(engine: sa.Engine) -> None: + """ + Drop database tables. 
+ """ + try: + mf = ModelFactory() + mf.BaseModel.metadata.drop_all(engine, checkfirst=False) + except Exception as ex: + if "RelationUnknown" not in str(ex): + raise + + +@pytest.fixture +def prune_tables(engine: sa.Engine) -> None: + """ + Delete data from database tables. + """ + with engine.connect() as conn: + with Session(conn) as session: + mf = ModelFactory() + try: + session.query(mf.CollectionStore).delete() + except ProgrammingError: + pass + try: + session.query(mf.EmbeddingStore).delete() + except ProgrammingError: + pass + + +def ensure_collection(session: sa.orm.Session, name: str) -> None: + """ + Create a (fake) collection item. + """ + session.execute( + sa.text( + """ + CREATE TABLE IF NOT EXISTS collection ( + uuid TEXT, + name TEXT, + cmetadata OBJECT + ); + """ + ) + ) + session.execute( + sa.text( + """ + CREATE TABLE IF NOT EXISTS embedding ( + uuid TEXT, + collection_id TEXT, + embedding FLOAT_VECTOR(123), + document TEXT, + cmetadata OBJECT, + custom_id TEXT + ); + """ + ) + ) + try: + session.execute( + sa.text( + f"INSERT INTO collection (uuid, name, cmetadata) " + f"VALUES ('uuid-{name}', '{name}', {{}});" + ) + ) + session.execute(sa.text("REFRESH TABLE collection")) + except sa.exc.IntegrityError: + pass + + +class FakeEmbeddingsWithAdaDimension(FakeEmbeddings): + """Fake embeddings functionality for testing.""" + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Return simple embeddings.""" + return [ + [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(i)] for i in range(len(texts)) + ] + + def embed_query(self, text: str) -> List[float]: + """Return simple embeddings.""" + return [float(1.0)] * (ADA_TOKEN_COUNT - 1) + [float(0.0)] + + +class ConsistentFakeEmbeddingsWithAdaDimension(ConsistentFakeEmbeddings): + """ + Fake embeddings which remember all the texts seen so far to return + consistent vectors for the same texts. 
+ + Other than this, they also have a fixed dimensionality, which is + important in this case. + """ + + def __init__(self, *args: List, **kwargs: Dict) -> None: + super().__init__(dimensionality=ADA_TOKEN_COUNT) + + +def test_cratedb_texts() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_cratedb_embedding_dimension() -> None: + """Verify the `embedding` column uses the correct vector dimensionality.""" + texts = ["foo", "bar", "baz"] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + with docsearch.Session() as session: + result = session.execute(sa.text(f"SHOW CREATE TABLE {SCHEMA_NAME}.embedding")) + record = result.first() + if not record: + raise ValueError("No data found") + ddl = record[0] + assert f'"embedding" FLOAT_VECTOR({ADA_TOKEN_COUNT})' in ddl + + +def test_cratedb_embeddings() -> None: + """Test end to end construction with embeddings and search.""" + texts = ["foo", "bar", "baz"] + text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + docsearch = CrateDBVectorSearch.from_embeddings( + text_embeddings=text_embedding_pairs, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_cratedb_with_metadatas() -> None: + 
"""Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + + +def test_cratedb_with_metadatas_with_scores() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 2.0)] + + +def test_cratedb_with_filter_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + # TODO: Original: + # assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] # noqa: E501 + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"}) + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(2.2, 0.3)) + ] + + +def test_cratedb_with_filter_distant_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": 
str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_score("foo", k=2, filter={"page": "2"}) + # Original score value: 0.0013003906671379406 + assert output == [ + (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(1.5, 0.2)) + ] + + +def test_cratedb_with_filter_no_match() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "5"}) + assert output == [] + + +def test_cratedb_collection_delete() -> None: + """ + Test end to end collection construction and deletion. + Uses two different collections of embeddings. + """ + + store_foo = CrateDBVectorSearch.from_texts( + texts=["foo"], + collection_name="test_collection_foo", + collection_metadata={"category": "foo"}, + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=[{"document": "foo"}], + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + store_bar = CrateDBVectorSearch.from_texts( + texts=["bar"], + collection_name="test_collection_bar", + collection_metadata={"category": "bar"}, + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=[{"document": "bar"}], + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + session = store_foo.Session() + + # Verify data in database. 
+ collection_foo = store_foo.get_collection(session) + collection_bar = store_bar.get_collection(session) + if collection_foo is None or collection_bar is None: + assert False, "Expected CollectionStore objects but received None" + assert collection_foo.embeddings[0].cmetadata == {"document": "foo"} + assert collection_bar.embeddings[0].cmetadata == {"document": "bar"} + + # Delete first collection. + store_foo.delete_collection() + + # Verify that the "foo" collection has been deleted. + collection_foo = store_foo.get_collection(session) + collection_bar = store_bar.get_collection(session) + if collection_bar is None: + assert False, "Expected CollectionStore object but received None" + assert collection_foo is None + assert collection_bar.embeddings[0].cmetadata == {"document": "bar"} + + # Verify that associated embeddings also have been deleted. + embeddings_count = session.query(store_foo.EmbeddingStore).count() + assert embeddings_count == 1 + + +def test_cratedb_collection_with_metadata() -> None: + """Test end to end collection construction""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + cratedb_vector = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + collection_metadata={"foo": "bar"}, + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + collection = cratedb_vector.get_collection(cratedb_vector.Session()) + if collection is None: + assert False, "Expected a CollectionStore object but received None" + else: + assert collection.name == "test_collection" + assert collection.cmetadata == {"foo": "bar"} + + +def test_cratedb_collection_no_embedding_dimension() -> None: + """ + Verify that addressing collections fails when not specifying dimensions. 
+ """ + cratedb_vector = CrateDBVectorSearch( + embedding_function=None, # type: ignore[arg-type] + connection_string=CONNECTION_STRING, + ) + session = cratedb_vector.Session() + with pytest.raises(RuntimeError) as ex: + cratedb_vector.get_collection(session) + assert ex.match( + "Collection can't be accessed without specifying " + "dimension size of embedding vectors" + ) + + +def test_cratedb_collection_read_only(session: Session) -> None: + """ + Test using a collection, without adding any embeddings upfront. + + This happens when just invoking the "retrieval" case. + + In this scenario, embedding dimensionality needs to be figured out + from the supplied `embedding_function`. + """ + + # Create a fake collection item. + ensure_collection(session, "baz2") + + # This test case needs an embedding _with_ dimensionality. + # Otherwise, the data access layer is unable to figure it + # out at runtime. + embedding = ConsistentFakeEmbeddingsWithAdaDimension() + + vectorstore = CrateDBVectorSearch( + collection_name="baz2", + connection_string=CONNECTION_STRING, + embedding_function=embedding, + ) + output = vectorstore.similarity_search("foo", k=1) + + # No documents/embeddings have been loaded, the collection is empty. + # This is why there are also no results. 
+ assert output == [] + + +def test_cratedb_with_filter_in_set() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.similarity_search_with_score( + "foo", k=2, filter={"page": {"IN": ["0", "2"]}} + ) + # Original score values: 0.0, 0.0013003906671379406 + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(3.0, 0.1)), + (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(2.2, 0.1)), + ] + + +def test_cratedb_delete_docs() -> None: + """Add and delete documents.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection_filter", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + ids=["1", "2", "3"], + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + docsearch.delete(["1", "2"]) + with docsearch._make_session() as session: + records = list(session.query(docsearch.EmbeddingStore).all()) + # ignoring type error since mypy cannot determine whether + # the list is sortable + assert sorted(record.custom_id for record in records) == ["3"] # type: ignore + + docsearch.delete(["2", "3"]) # Should not raise on missing ids + with docsearch._make_session() as session: + records = list(session.query(docsearch.EmbeddingStore).all()) + # ignoring type error since mypy cannot determine whether + # the list is sortable + assert sorted(record.custom_id for record in records) == [] # type: ignore + + +def test_cratedb_relevance_score() -> None: + """Test to make sure the relevance score is scaled 
to 0-1.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + output = docsearch.similarity_search_with_relevance_scores("foo", k=3) + # Original score values: 1.0, 0.9996744261675065, 0.9986996093328621 + assert output == [ + (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(1.4, 0.1)), + (Document(page_content="bar", metadata={"page": "1"}), pytest.approx(1.1, 0.1)), + (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(0.8, 0.1)), + ] + + +def test_cratedb_retriever_search_threshold() -> None: + """Test using retriever for searching with threshold.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + retriever = docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 3, "score_threshold": 0.999}, + ) + output = retriever.get_relevant_documents("summer") + assert output == [ + Document(page_content="foo", metadata={"page": "0"}), + Document(page_content="bar", metadata={"page": "1"}), + ] + + +def test_cratedb_retriever_search_threshold_custom_normalization_fn() -> None: + """Test searching with threshold and custom normalization function""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + metadatas=metadatas, + connection_string=CONNECTION_STRING, 
+ pre_delete_collection=True, + relevance_score_fn=lambda d: d * 0, + ) + + retriever = docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={"k": 3, "score_threshold": 0.5}, + ) + output = retriever.get_relevant_documents("foo") + assert output == [] + + +def test_cratedb_max_marginal_relevance_search() -> None: + """Test max marginal relevance search.""" + texts = ["foo", "bar", "baz"] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3) + assert output == [Document(page_content="foo")] + + +def test_cratedb_max_marginal_relevance_search_with_score() -> None: + """Test max marginal relevance search with relevance scores.""" + texts = ["foo", "bar", "baz"] + docsearch = CrateDBVectorSearch.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3) + assert output == [(Document(page_content="foo"), 2.0)] + + +def test_cratedb_multicollection_search_success() -> None: + """ + `CrateDBVectorSearchMultiCollection` provides functionality for + searching multiple collections. + """ + + store_1 = CrateDBVectorSearch.from_texts( + texts=["Räuber", "Hotzenplotz"], + collection_name="test_collection_1", + embedding=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + _ = CrateDBVectorSearch.from_texts( + texts=["John", "Doe"], + collection_name="test_collection_2", + embedding=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + # Probe the first store. 
+ output = store_1.similarity_search("Räuber", k=1) + assert Document(page_content="Räuber") in output[:2] + output = store_1.similarity_search("Hotzenplotz", k=1) + assert Document(page_content="Hotzenplotz") in output[:2] + output = store_1.similarity_search("John Doe", k=1) + assert Document(page_content="Hotzenplotz") in output[:2] + + # Probe the multi-store. + multisearch = CrateDBVectorSearchMultiCollection( + collection_names=["test_collection_1", "test_collection_2"], + embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + ) + output = multisearch.similarity_search("Räuber Hotzenplotz", k=2) + assert Document(page_content="Räuber") in output[:2] + output = multisearch.similarity_search("John Doe", k=2) + assert Document(page_content="Doe") in output[:2] + + +def test_cratedb_multicollection_fail_indexing_not_permitted() -> None: + """ + `CrateDBVectorSearchMultiCollection` does not provide functionality for + indexing documents. + """ + + with pytest.raises(NotImplementedError) as ex: + CrateDBVectorSearchMultiCollection.from_texts( + texts=["foo"], + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + ) + assert ex.match("This adapter can not be used for indexing documents") + + +def test_cratedb_multicollection_search_table_does_not_exist() -> None: + """ + `CrateDBVectorSearchMultiCollection` will fail when the `collection` + table does not exist. 
+ """ + + store = CrateDBVectorSearchMultiCollection( + collection_names=["unknown"], + embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + ) + with pytest.raises(ProgrammingError) as ex: + store.similarity_search("foo") + assert ex.match(re.escape("RelationUnknown[Relation 'collection' unknown]")) + + +def test_cratedb_multicollection_search_unknown_collection() -> None: + """ + `CrateDBVectorSearchMultiCollection` will fail when not able to identify + collections to search in. + """ + + CrateDBVectorSearch.from_texts( + texts=["Räuber", "Hotzenplotz"], + collection_name="test_collection", + embedding=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + + store = CrateDBVectorSearchMultiCollection( + collection_names=["unknown"], + embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + ) + with pytest.raises(ValueError) as ex: + store.similarity_search("foo") + assert ex.match("No collections found") + + +def test_cratedb_multicollection_no_embedding_dimension() -> None: + """ + Verify that addressing collections fails when not specifying dimensions. 
+ """ + store = CrateDBVectorSearchMultiCollection( + embedding_function=None, # type: ignore[arg-type] + connection_string=CONNECTION_STRING, + ) + session = store.Session() + with pytest.raises(RuntimeError) as ex: + store.get_collection(session) + assert ex.match( + "Collection can't be accessed without specifying " + "dimension size of embedding vectors" + ) diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 5ac0ca72b49c5..1bf5fca73ba57 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -27,6 +27,7 @@ "Clickhouse", "ClickhouseSettings", "CouchbaseVectorStore", + "CrateDBVectorSearch", "DashVector", "DatabricksVectorSearch", "DeepLake", diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index 041f4172b2dcb..b5a943c610636 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -16,7 +16,7 @@ def test_compatible_vectorstore_documentation() -> None: case and 1) update docs in [1] and 2) update the `documented` dict in this test case. 
- [1] langchain/docs/docs/modules/data_connection/indexing.ipynb + [1] langchain/docs/docs/how_to/indexing.ipynb """ # Check if a vectorstore is compatible with the indexing API @@ -60,6 +60,7 @@ def check_compatibility(vector_store: VectorStore) -> bool: "Cassandra", "Chroma", "CouchbaseVectorStore", + "CrateDBVectorSearch", "DashVector", "DatabricksVectorSearch", "TiDBVectorStore", From be39c5ae99fa00d712f1f012a9b9235cb7fbbe4b Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 31 Oct 2024 07:56:27 +0100 Subject: [PATCH 2/7] CrateDB: Vector Store -- rename to CrateDBVectorStore --- .../vectorstores/__init__.py | 6 +- .../vectorstores/cratedb/__init__.py | 8 +-- .../vectorstores/cratedb/base.py | 12 ++-- .../vectorstores/cratedb/extended.py | 10 +-- .../vectorstores/test_cratedb.py | 70 +++++++++---------- .../unit_tests/vectorstores/test_imports.py | 2 +- .../vectorstores/test_indexing_docs.py | 2 +- 7 files changed, 55 insertions(+), 55 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/__init__.py b/libs/community/langchain_community/vectorstores/__init__.py index 5741fd7a644b7..76ca824bd4e3e 100644 --- a/libs/community/langchain_community/vectorstores/__init__.py +++ b/libs/community/langchain_community/vectorstores/__init__.py @@ -93,7 +93,7 @@ CouchbaseVectorStore, ) from langchain_community.vectorstores.cratedb import ( - CrateDBVectorSearch, + CrateDBVectorStore, ) from langchain_community.vectorstores.dashvector import ( DashVector, @@ -337,7 +337,7 @@ "Clickhouse", "ClickhouseSettings", "CouchbaseVectorStore", - "CrateDBVectorSearch", + "CrateDBVectorStore", "DashVector", "DatabricksVectorSearch", "DeepLake", @@ -442,7 +442,7 @@ "Clickhouse": "langchain_community.vectorstores.clickhouse", "ClickhouseSettings": "langchain_community.vectorstores.clickhouse", "CouchbaseVectorStore": "langchain_community.vectorstores.couchbase", - "CrateDBVectorSearch": "langchain_community.vectorstores.cratedb", + "CrateDBVectorStore": 
"langchain_community.vectorstores.cratedb", "DashVector": "langchain_community.vectorstores.dashvector", "DatabricksVectorSearch": "langchain_community.vectorstores.databricks_vector_search", # noqa: E501 "DeepLake": "langchain_community.vectorstores.deeplake", diff --git a/libs/community/langchain_community/vectorstores/cratedb/__init__.py b/libs/community/langchain_community/vectorstores/cratedb/__init__.py index 62462bce1eba9..de17f15f7b252 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/__init__.py +++ b/libs/community/langchain_community/vectorstores/cratedb/__init__.py @@ -1,7 +1,7 @@ -from .base import CrateDBVectorSearch -from .extended import CrateDBVectorSearchMultiCollection +from .base import CrateDBVectorStore +from .extended import CrateDBVectorStoreMultiCollection __all__ = [ - "CrateDBVectorSearch", - "CrateDBVectorSearchMultiCollection", + "CrateDBVectorStore", + "CrateDBVectorStoreMultiCollection", ] diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index ff392923cba44..bb1d2102c18f5 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -42,7 +42,7 @@ def _results_to_docs(docs_and_scores: Any) -> List[Document]: return [doc for doc, _ in docs_and_scores] -class CrateDBVectorSearch(PGVector): +class CrateDBVectorStore(PGVector): """`CrateDB` vector store. To use it, you should have the ``crate[sqlalchemy]`` python package installed. @@ -62,13 +62,13 @@ class CrateDBVectorSearch(PGVector): Example: .. 
code-block:: python - from langchain.vectorstores import CrateDBVectorSearch + from langchain.vectorstores import CrateDBVectorStore from langchain.embeddings.openai import OpenAIEmbeddings CONNECTION_STRING = "crate://crate@localhost:4200/test3" COLLECTION_NAME = "state_of_the_union_test" embeddings = OpenAIEmbeddings() - vectorestore = CrateDBVectorSearch.from_documents( + vectorestore = CrateDBVectorStore.from_documents( embedding=embeddings, documents=docs, collection_name=COLLECTION_NAME, @@ -337,7 +337,7 @@ def _query_collection_multi( @classmethod def from_texts( # type: ignore[override] - cls: Type[CrateDBVectorSearch], + cls: Type[CrateDBVectorStore], texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, @@ -346,7 +346,7 @@ def from_texts( # type: ignore[override] ids: Optional[List[str]] = None, pre_delete_collection: bool = False, **kwargs: Any, - ) -> CrateDBVectorSearch: + ) -> CrateDBVectorStore: """ Return VectorStore initialized from texts and embeddings. Database connection string is required. @@ -431,7 +431,7 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]: raise ValueError( "No supported normalization function for distance_strategy of " f"{self._distance_strategy}. Consider providing relevance_score_fn to " - "CrateDBVectorSearch constructor." + "CrateDBVectorStore constructor." 
) @staticmethod diff --git a/libs/community/langchain_community/vectorstores/cratedb/extended.py b/libs/community/langchain_community/vectorstores/cratedb/extended.py index 4d5ae8e88f3fd..2a123eb2ee14a 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/extended.py +++ b/libs/community/langchain_community/vectorstores/cratedb/extended.py @@ -12,13 +12,13 @@ from langchain_community.vectorstores.cratedb.base import ( DEFAULT_DISTANCE_STRATEGY, - CrateDBVectorSearch, + CrateDBVectorStore, DistanceStrategy, ) from langchain_community.vectorstores.pgvector import _LANGCHAIN_DEFAULT_COLLECTION_NAME -class CrateDBVectorSearchMultiCollection(CrateDBVectorSearch): +class CrateDBVectorStoreMultiCollection(CrateDBVectorStore): """ Provide functionality for searching multiple collections. It can not be used for indexing documents. @@ -27,15 +27,15 @@ class CrateDBVectorSearchMultiCollection(CrateDBVectorSearch): Synopsis:: - from langchain.vectorstores.cratedb import CrateDBVectorSearchMultiCollection + from langchain_community.vectorstores.cratedb import CrateDBVectorStoreMultiCollection - multisearch = CrateDBVectorSearchMultiCollection( + multisearch = CrateDBVectorStoreMultiCollection( collection_names=["collection_foo", "collection_bar"], embedding_function=embeddings, connection_string=CONNECTION_STRING, ) docs_with_score = multisearch.similarity_search_with_score(query) - """ + """ # noqa: E501 def __init__( self, diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py index e258a42177f70..c4f2b24835c62 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py @@ -16,9 +16,9 @@ from sqlalchemy.exc import ProgrammingError from sqlalchemy.orm import Session -from langchain_community.vectorstores.cratedb import CrateDBVectorSearch +from 
langchain_community.vectorstores.cratedb import CrateDBVectorStore from langchain_community.vectorstores.cratedb.extended import ( - CrateDBVectorSearchMultiCollection, + CrateDBVectorStoreMultiCollection, ) from langchain_community.vectorstores.cratedb.model import ModelFactory from tests.integration_tests.vectorstores.fake_embeddings import ( @@ -28,7 +28,7 @@ SCHEMA_NAME = os.environ.get("TEST_CRATEDB_DATABASE", "testdrive") -CONNECTION_STRING = CrateDBVectorSearch.connection_string_from_db_params( +CONNECTION_STRING = CrateDBVectorStore.connection_string_from_db_params( driver=os.environ.get("TEST_CRATEDB_DRIVER", "crate"), host=os.environ.get("TEST_CRATEDB_HOST", "localhost"), port=int(os.environ.get("TEST_CRATEDB_PORT", "4200")), @@ -157,7 +157,7 @@ def __init__(self, *args: List, **kwargs: Dict) -> None: def test_cratedb_texts() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -171,7 +171,7 @@ def test_cratedb_texts() -> None: def test_cratedb_embedding_dimension() -> None: """Verify the `embedding` column uses the correct vector dimensionality.""" texts = ["foo", "bar", "baz"] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=ConsistentFakeEmbeddingsWithAdaDimension(), @@ -192,7 +192,7 @@ def test_cratedb_embeddings() -> None: texts = ["foo", "bar", "baz"] text_embeddings = FakeEmbeddingsWithAdaDimension().embed_documents(texts) text_embedding_pairs = list(zip(texts, text_embeddings)) - docsearch = CrateDBVectorSearch.from_embeddings( + docsearch = CrateDBVectorStore.from_embeddings( text_embeddings=text_embedding_pairs, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -207,7 +207,7 @@ def 
test_cratedb_with_metadatas() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -223,7 +223,7 @@ def test_cratedb_with_metadatas_with_scores() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -239,7 +239,7 @@ def test_cratedb_with_filter_match() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection_filter", embedding=FakeEmbeddingsWithAdaDimension(), @@ -259,7 +259,7 @@ def test_cratedb_with_filter_distant_match() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection_filter", embedding=FakeEmbeddingsWithAdaDimension(), @@ -278,7 +278,7 @@ def test_cratedb_with_filter_no_match() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection_filter", embedding=FakeEmbeddingsWithAdaDimension(), @@ -296,7 +296,7 @@ def test_cratedb_collection_delete() -> None: 
Uses two different collections of embeddings. """ - store_foo = CrateDBVectorSearch.from_texts( + store_foo = CrateDBVectorStore.from_texts( texts=["foo"], collection_name="test_collection_foo", collection_metadata={"category": "foo"}, @@ -305,7 +305,7 @@ def test_cratedb_collection_delete() -> None: connection_string=CONNECTION_STRING, pre_delete_collection=True, ) - store_bar = CrateDBVectorSearch.from_texts( + store_bar = CrateDBVectorStore.from_texts( texts=["bar"], collection_name="test_collection_bar", collection_metadata={"category": "bar"}, @@ -344,7 +344,7 @@ def test_cratedb_collection_with_metadata() -> None: """Test end to end collection construction""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - cratedb_vector = CrateDBVectorSearch.from_texts( + cratedb_vector = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", collection_metadata={"foo": "bar"}, @@ -365,7 +365,7 @@ def test_cratedb_collection_no_embedding_dimension() -> None: """ Verify that addressing collections fails when not specifying dimensions. """ - cratedb_vector = CrateDBVectorSearch( + cratedb_vector = CrateDBVectorStore( embedding_function=None, # type: ignore[arg-type] connection_string=CONNECTION_STRING, ) @@ -396,7 +396,7 @@ def test_cratedb_collection_read_only(session: Session) -> None: # out at runtime. 
embedding = ConsistentFakeEmbeddingsWithAdaDimension() - vectorstore = CrateDBVectorSearch( + vectorstore = CrateDBVectorStore( collection_name="baz2", connection_string=CONNECTION_STRING, embedding_function=embedding, @@ -412,7 +412,7 @@ def test_cratedb_with_filter_in_set() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection_filter", embedding=FakeEmbeddingsWithAdaDimension(), @@ -434,7 +434,7 @@ def test_cratedb_delete_docs() -> None: """Add and delete documents.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection_filter", embedding=FakeEmbeddingsWithAdaDimension(), @@ -462,7 +462,7 @@ def test_cratedb_relevance_score() -> None: """Test to make sure the relevance score is scaled to 0-1.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -484,7 +484,7 @@ def test_cratedb_retriever_search_threshold() -> None: """Test using retriever for searching with threshold.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -508,7 +508,7 @@ def test_cratedb_retriever_search_threshold_custom_normalization_fn() -> None: """Test searching with threshold and custom normalization function""" texts = ["foo", "bar", "baz"] metadatas = 
[{"page": str(i)} for i in range(len(texts))] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -529,7 +529,7 @@ def test_cratedb_retriever_search_threshold_custom_normalization_fn() -> None: def test_cratedb_max_marginal_relevance_search() -> None: """Test max marginal relevance search.""" texts = ["foo", "bar", "baz"] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -543,7 +543,7 @@ def test_cratedb_max_marginal_relevance_search() -> None: def test_cratedb_max_marginal_relevance_search_with_score() -> None: """Test max marginal relevance search with relevance scores.""" texts = ["foo", "bar", "baz"] - docsearch = CrateDBVectorSearch.from_texts( + docsearch = CrateDBVectorStore.from_texts( texts=texts, collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -556,18 +556,18 @@ def test_cratedb_max_marginal_relevance_search_with_score() -> None: def test_cratedb_multicollection_search_success() -> None: """ - `CrateDBVectorSearchMultiCollection` provides functionality for + `CrateDBVectorStoreMultiCollection` provides functionality for searching multiple collections. 
""" - store_1 = CrateDBVectorSearch.from_texts( + store_1 = CrateDBVectorStore.from_texts( texts=["Räuber", "Hotzenplotz"], collection_name="test_collection_1", embedding=ConsistentFakeEmbeddingsWithAdaDimension(), connection_string=CONNECTION_STRING, pre_delete_collection=True, ) - _ = CrateDBVectorSearch.from_texts( + _ = CrateDBVectorStore.from_texts( texts=["John", "Doe"], collection_name="test_collection_2", embedding=ConsistentFakeEmbeddingsWithAdaDimension(), @@ -584,7 +584,7 @@ def test_cratedb_multicollection_search_success() -> None: assert Document(page_content="Hotzenplotz") in output[:2] # Probe the multi-store. - multisearch = CrateDBVectorSearchMultiCollection( + multisearch = CrateDBVectorStoreMultiCollection( collection_names=["test_collection_1", "test_collection_2"], embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), connection_string=CONNECTION_STRING, @@ -597,12 +597,12 @@ def test_cratedb_multicollection_search_success() -> None: def test_cratedb_multicollection_fail_indexing_not_permitted() -> None: """ - `CrateDBVectorSearchMultiCollection` does not provide functionality for + `CrateDBVectorStoreMultiCollection` does not provide functionality for indexing documents. """ with pytest.raises(NotImplementedError) as ex: - CrateDBVectorSearchMultiCollection.from_texts( + CrateDBVectorStoreMultiCollection.from_texts( texts=["foo"], collection_name="test_collection", embedding=FakeEmbeddingsWithAdaDimension(), @@ -613,11 +613,11 @@ def test_cratedb_multicollection_fail_indexing_not_permitted() -> None: def test_cratedb_multicollection_search_table_does_not_exist() -> None: """ - `CrateDBVectorSearchMultiCollection` will fail when the `collection` + `CrateDBVectorStoreMultiCollection` will fail when the `collection` table does not exist. 
""" - store = CrateDBVectorSearchMultiCollection( + store = CrateDBVectorStoreMultiCollection( collection_names=["unknown"], embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), connection_string=CONNECTION_STRING, @@ -629,11 +629,11 @@ def test_cratedb_multicollection_search_table_does_not_exist() -> None: def test_cratedb_multicollection_search_unknown_collection() -> None: """ - `CrateDBVectorSearchMultiCollection` will fail when not able to identify + `CrateDBVectorStoreMultiCollection` will fail when not able to identify collections to search in. """ - CrateDBVectorSearch.from_texts( + CrateDBVectorStore.from_texts( texts=["Räuber", "Hotzenplotz"], collection_name="test_collection", embedding=ConsistentFakeEmbeddingsWithAdaDimension(), @@ -641,7 +641,7 @@ def test_cratedb_multicollection_search_unknown_collection() -> None: pre_delete_collection=True, ) - store = CrateDBVectorSearchMultiCollection( + store = CrateDBVectorStoreMultiCollection( collection_names=["unknown"], embedding_function=ConsistentFakeEmbeddingsWithAdaDimension(), connection_string=CONNECTION_STRING, @@ -655,7 +655,7 @@ def test_cratedb_multicollection_no_embedding_dimension() -> None: """ Verify that addressing collections fails when not specifying dimensions. 
""" - store = CrateDBVectorSearchMultiCollection( + store = CrateDBVectorStoreMultiCollection( embedding_function=None, # type: ignore[arg-type] connection_string=CONNECTION_STRING, ) diff --git a/libs/community/tests/unit_tests/vectorstores/test_imports.py b/libs/community/tests/unit_tests/vectorstores/test_imports.py index 1bf5fca73ba57..0412b3222005b 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_imports.py +++ b/libs/community/tests/unit_tests/vectorstores/test_imports.py @@ -27,7 +27,7 @@ "Clickhouse", "ClickhouseSettings", "CouchbaseVectorStore", - "CrateDBVectorSearch", + "CrateDBVectorStore", "DashVector", "DatabricksVectorSearch", "DeepLake", diff --git a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py index b5a943c610636..2f56c8098a396 100644 --- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py +++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py @@ -60,7 +60,7 @@ def check_compatibility(vector_store: VectorStore) -> bool: "Cassandra", "Chroma", "CouchbaseVectorStore", - "CrateDBVectorSearch", + "CrateDBVectorStore", "DashVector", "DatabricksVectorSearch", "TiDBVectorStore", From 7e0bc9cc6573d025ec8d0385015cb1167b622ee3 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 31 Oct 2024 07:57:07 +0100 Subject: [PATCH 3/7] CrateDB: Vector Store -- improve inline documentation --- .../vectorstores/cratedb/base.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index bb1d2102c18f5..77984daf2cf39 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -24,7 +24,17 @@ class DistanceStrategy(str, enum.Enum): - """Enumerator of the Distance strategies.""" + """ + 
Enumerator of the Distance strategies. + + Note that CrateDB and Lucene currently only implement + similarity based on the Euclidean distance. + + > Today, when creating a FLOAT_VECTOR, it uses the default + > EUCLIDEAN_HNSW (L2) similarity. + > + > -- https://github.com/crate/crate/issues/15768 + """ EUCLIDEAN = "euclidean" COSINE = "cosine" @@ -45,7 +55,9 @@ def _results_to_docs(docs_and_scores: Any) -> List[Document]: class CrateDBVectorStore(PGVector): """`CrateDB` vector store. - To use it, you should have the ``crate[sqlalchemy]`` python package installed. + To use it, please install the Python package `sqlalchemy-cratedb`. + + uv pip install --upgrade sqlalchemy-cratedb Args: connection_string: Database connection string. @@ -312,6 +324,9 @@ def _query_collection_multi( results: List[Any] = ( session.query( # type: ignore[attr-defined] self.EmbeddingStore, + # FIXME: Using `_score` is definitively the wrong choice. + # - https://github.com/crate-workbench/langchain/issues/19 + # - https://github.com/crate/crate/issues/15835 # TODO: Original pgvector code uses `self.distance_strategy`. # CrateDB currently only supports EUCLIDEAN. # self.distance_strategy(embedding).label("distance") From 59429fd0356ae6555808d31398bcad2e15f65cf9 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Fri, 1 Nov 2024 22:41:00 +0100 Subject: [PATCH 4/7] CrateDB: Vector Store -- make it work using CrateDB's vector_similarity Before, the adapter used CrateDB's built-in `_score` field for ranking. Now, it uses the dedicated `vector_similarity()` function to compute the similarity between two vectors. 
--- .../vectorstores/cratedb/base.py | 27 +++++++++++------- .../vectorstores/test_cratedb.py | 28 ++++++++----------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index 77984daf2cf39..e9109764ecfc7 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -260,7 +260,7 @@ def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, floa page_content=result.EmbeddingStore.document, metadata=result.EmbeddingStore.cmetadata, ), - result._score if self.embedding_function is not None else None, + result.similarity if self.embedding_function is not None else None, ) for result in results ] @@ -324,15 +324,22 @@ def _query_collection_multi( results: List[Any] = ( session.query( # type: ignore[attr-defined] self.EmbeddingStore, - # FIXME: Using `_score` is definitively the wrong choice. - # - https://github.com/crate-workbench/langchain/issues/19 - # - https://github.com/crate/crate/issues/15835 # TODO: Original pgvector code uses `self.distance_strategy`. # CrateDB currently only supports EUCLIDEAN. # self.distance_strategy(embedding).label("distance") - sqlalchemy.literal_column( - f"{self.EmbeddingStore.__tablename__}._score" - ).label("_score"), + sqlalchemy.func.vector_similarity( + self.EmbeddingStore.embedding, + # TODO: Just reference the `embedding` symbol here, don't + # serialize its value prematurely. + # https://github.com/crate/crate/issues/16912 + # + # Until that got fixed, marshal the arguments to + # `vector_similarity()` manually, in order to work around + # this edge case bug. We don't need to use JSON marshalling, + # because Python's string representation of a list is just + # right. 
+ sqlalchemy.text(str(embedding)), + ).label("similarity"), ) .filter(filter_by) # CrateDB applies `KNN_MATCH` within the `WHERE` clause. @@ -341,7 +348,7 @@ def _query_collection_multi( self.EmbeddingStore.embedding, embedding, k ) ) - .order_by(sqlalchemy.desc("_score")) + .order_by(sqlalchemy.desc("similarity")) .join( self.CollectionStore, self.EmbeddingStore.collection_id == self.CollectionStore.uuid, @@ -450,7 +457,7 @@ def _select_relevance_score_fn(self) -> Callable[[float], float]: ) @staticmethod - def _euclidean_relevance_score_fn(score: float) -> float: + def _euclidean_relevance_score_fn(similarity: float) -> float: """Return a similarity score on a scale [0, 1].""" # The 'correct' relevance function # may differ depending on a few things, including: @@ -465,4 +472,4 @@ def _euclidean_relevance_score_fn(score: float) -> float: # Original: # return 1.0 - distance / math.sqrt(2) - return score / math.sqrt(2) + return similarity / math.sqrt(2) diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py index c4f2b24835c62..acc03547fe565 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py @@ -232,7 +232,7 @@ def test_cratedb_with_metadatas_with_scores() -> None: pre_delete_collection=True, ) output = docsearch.similarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo", metadata={"page": "0"}), 2.0)] + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] def test_cratedb_with_filter_match() -> None: @@ -250,9 +250,7 @@ def test_cratedb_with_filter_match() -> None: # TODO: Original: # assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] # noqa: E501 output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"}) - assert output == [ - (Document(page_content="foo", 
metadata={"page": "0"}), pytest.approx(2.2, 0.3)) - ] + assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] def test_cratedb_with_filter_distant_match() -> None: @@ -269,9 +267,7 @@ def test_cratedb_with_filter_distant_match() -> None: ) output = docsearch.similarity_search_with_score("foo", k=2, filter={"page": "2"}) # Original score value: 0.0013003906671379406 - assert output == [ - (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(1.5, 0.2)) - ] + assert output == [(Document(page_content="baz", metadata={"page": "2"}), 0.2)] def test_cratedb_with_filter_no_match() -> None: @@ -425,8 +421,8 @@ def test_cratedb_with_filter_in_set() -> None: ) # Original score values: 0.0, 0.0013003906671379406 assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(3.0, 0.1)), - (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(2.2, 0.1)), + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="baz", metadata={"page": "2"}), 0.2), ] @@ -474,9 +470,9 @@ def test_cratedb_relevance_score() -> None: output = docsearch.similarity_search_with_relevance_scores("foo", k=3) # Original score values: 1.0, 0.9996744261675065, 0.9986996093328621 assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), pytest.approx(1.4, 0.1)), - (Document(page_content="bar", metadata={"page": "1"}), pytest.approx(1.1, 0.1)), - (Document(page_content="baz", metadata={"page": "2"}), pytest.approx(0.8, 0.1)), + (Document(page_content="foo", metadata={"page": "0"}), 0.7071067811865475), + (Document(page_content="bar", metadata={"page": "1"}), 0.35355339059327373), + (Document(page_content="baz", metadata={"page": "2"}), 0.1414213562373095), ] @@ -495,9 +491,9 @@ def test_cratedb_retriever_search_threshold() -> None: retriever = docsearch.as_retriever( search_type="similarity_score_threshold", - search_kwargs={"k": 3, "score_threshold": 0.999}, + search_kwargs={"k": 
3, "score_threshold": 0.35}, # Original value: 0.999 ) - output = retriever.get_relevant_documents("summer") + output = retriever.invoke("summer") assert output == [ Document(page_content="foo", metadata={"page": "0"}), Document(page_content="bar", metadata={"page": "1"}), @@ -522,7 +518,7 @@ def test_cratedb_retriever_search_threshold_custom_normalization_fn() -> None: search_type="similarity_score_threshold", search_kwargs={"k": 3, "score_threshold": 0.5}, ) - output = retriever.get_relevant_documents("foo") + output = retriever.invoke("foo") assert output == [] @@ -551,7 +547,7 @@ def test_cratedb_max_marginal_relevance_search_with_score() -> None: pre_delete_collection=True, ) output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3) - assert output == [(Document(page_content="foo"), 2.0)] + assert output == [(Document(page_content="foo"), 1.0)] def test_cratedb_multicollection_search_success() -> None: From 467ee6a5e3cd0074a10279dbc3264726254f0618 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 6 Nov 2024 23:49:26 +0100 Subject: [PATCH 5/7] CrateDB: Vector Store -- update dependencies --- libs/community/extended_testing_deps.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index f65e0de40507f..fa6a8dade0916 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -14,7 +14,6 @@ chardet>=5.1.0,<6 cloudpathlib>=0.18,<0.19 cloudpickle>=2.0.0 cohere>=4,<6 -crate==1.0.0.dev1 databricks-vectorsearch>=0.21,<0.22 datasets>=2.15.0,<3 dgml-utils>=0.3.0,<0.4 @@ -78,7 +77,7 @@ requests-toolbelt>=1.0.0,<2 rspace_client>=2.5.0,<3 scikit-learn>=1.2.2,<2 simsimd>=5.0.0,<6 -sqlalchemy-cratedb>=0.40.0,<1 +sqlalchemy-cratedb>=0.40.1,<1 sqlite-vss>=0.1.2,<0.2 sqlite-vec>=0.1.0,<0.2 sseclient-py>=1.8.0,<2 From 09db9e47d44d2b8921316961319a5a9e5309e752 Mon Sep 17 00:00:00 2001 From: Andreas Motl 
Date: Mon, 11 Nov 2024 05:49:07 +0100 Subject: [PATCH 6/7] CrateDB: Vector Store -- make _euclidean_relevance_score_fn an identity function. We don't need anything on top of it, i.e. we don't need this function and should instead use the value from CrateDB as is. Similarity is already in the (0,1] interval, and dividing by math.sqrt(2) won't normalize it but will return a wrong result; for example, 1 will become 0.707. --- .../langchain_community/vectorstores/cratedb/base.py | 5 ++--- .../tests/integration_tests/vectorstores/test_cratedb.py | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index e9109764ecfc7..eb05571cbf945 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -1,7 +1,6 @@ from __future__ import annotations import enum -import math from typing import ( Any, Callable, @@ -466,10 +465,10 @@ def _euclidean_relevance_score_fn(similarity: float) -> float: # others are not!) # - embedding dimensionality # - etc.
- # This function converts the euclidean norm of normalized embeddings + # This function converts the Euclidean norm of normalized embeddings # (0 is most similar, sqrt(2) most dissimilar) # to a similarity function (0 to 1) # Original: # return 1.0 - distance / math.sqrt(2) - return similarity / math.sqrt(2) + return similarity diff --git a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py index acc03547fe565..52aad4a0a8537 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_cratedb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_cratedb.py @@ -470,9 +470,9 @@ def test_cratedb_relevance_score() -> None: output = docsearch.similarity_search_with_relevance_scores("foo", k=3) # Original score values: 1.0, 0.9996744261675065, 0.9986996093328621 assert output == [ - (Document(page_content="foo", metadata={"page": "0"}), 0.7071067811865475), - (Document(page_content="bar", metadata={"page": "1"}), 0.35355339059327373), - (Document(page_content="baz", metadata={"page": "2"}), 0.1414213562373095), + (Document(page_content="foo", metadata={"page": "0"}), 1.0), + (Document(page_content="bar", metadata={"page": "1"}), 0.5), + (Document(page_content="baz", metadata={"page": "2"}), 0.2), ] From 4b9310e5175312ce087f9148383cd4336480a7b2 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 20 Nov 2024 17:20:09 +0100 Subject: [PATCH 7/7] CrateDB: Vector Store -- update docs in _euclidean_relevance_score_fn --- .../langchain_community/vectorstores/cratedb/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/cratedb/base.py b/libs/community/langchain_community/vectorstores/cratedb/base.py index eb05571cbf945..a9ac1a285bdd9 100644 --- a/libs/community/langchain_community/vectorstores/cratedb/base.py +++ b/libs/community/langchain_community/vectorstores/cratedb/base.py @@ -469,6 +469,7 @@ 
def _euclidean_relevance_score_fn(similarity: float) -> float: # (0 is most similar, sqrt(2) most dissimilar) # to a similarity function (0 to 1) - # Original: - # return 1.0 - distance / math.sqrt(2) + # CrateDB uses the `vector_similarity()` SQL function in this context, + # which already returns a normalized value. + # https://cratedb.com/docs/crate/reference/en/latest/general/builtins/scalar-functions.html#vector-similarity-float-vector-float-vector return similarity