langchain-ai · zc277584121 · Sep 30, 2024 · Sep 30, 2024
diff --git a/libs/milvus/langchain_milvus/vectorstores/milvus.py b/libs/milvus/langchain_milvus/vectorstores/milvus.py
@@ -258,6 +258,7 @@ def __init__(
         replica_number: int = 1,
         timeout: Optional[float] = None,
         num_shards: Optional[int] = None,
+        vector_schema: Optional[dict[str, Any]] = None,
         metadata_schema: Optional[dict[str, Any]] = None,
     ):
         """Initialize the Milvus vector store."""
@@ -271,6 +272,7 @@ def __init__(
 
         # Default search params when one is not provided.
         self.default_search_params = {
+            "FLAT": {"metric_type": "L2", "params": {}},
             "IVF_FLAT": {"metric_type": "L2", "params": {"nprobe": 10}},
             "IVF_SQ8": {"metric_type": "L2", "params": {"nprobe": 10}},
             "IVF_PQ": {"metric_type": "L2", "params": {"nprobe": 10}},
@@ -335,6 +337,7 @@ def __init__(
         self.timeout = timeout
         self.num_shards = num_shards
         self.metadata_schema = metadata_schema
+        self.vector_schema = vector_schema
 
         # Create the connection to the server
         if connection_args is None:
@@ -456,12 +459,9 @@ def _create_collection(
                         and key in self.metadata_schema  # type: ignore
                         and "dtype" in self.metadata_schema[key]  # type: ignore
                     ):
-                        kwargs = self.metadata_schema[key].get("kwargs", {})  # type: ignore
                         fields.append(
-                            FieldSchema(
-                                name=key,
-                                dtype=self.metadata_schema[key]["dtype"],  # type: ignore
-                                **kwargs,
+                            self._get_field_schema_from_dict(
+                                key, self.metadata_schema[key]
                             )
                         )
                     else:
@@ -515,15 +515,23 @@ def _create_collection(
                     max_length=65_535,
                 )
             )
-        # Create the vector field, supports binary or float vectors
-        if self._is_sparse_embedding:
-            fields.append(FieldSchema(self._vector_field, DataType.SPARSE_FLOAT_VECTOR))
-        else:
+        # Create the vector field
+        if self.vector_schema and "dtype" in self.vector_schema:
             fields.append(
-                FieldSchema(
-                    self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim
-                )
+                self._get_field_schema_from_dict(self._vector_field, self.vector_schema)
             )
+        else:
+            if self._is_sparse_embedding:
+                fields.append(
+                    FieldSchema(self._vector_field, DataType.SPARSE_FLOAT_VECTOR)
+                )
+            else:
+                # Supports binary or float vectors
+                fields.append(
+                    FieldSchema(
+                        self._vector_field, infer_dtype_bydata(embeddings[0]), dim=dim
+                    )
+                )
 
         # Create the schema for the collection
         schema = CollectionSchema(
@@ -561,6 +569,18 @@ def _create_collection(
             )
             raise e
 
+    def _get_field_schema_from_dict(self, field_name: str, schema_dict: dict):  # type: ignore[no-untyped-def]
+        from pymilvus import FieldSchema
+
+        assert "dtype" in schema_dict, (
+            f"Please provide `dtype` in the schema dict. "
+            f"Existing keys are: {schema_dict.keys()}"
+        )
+        dtype = schema_dict.pop("dtype")
+        kwargs = schema_dict.pop("kwargs", {})
+        kwargs.update(schema_dict)
+        return FieldSchema(name=field_name, dtype=dtype, **kwargs)
+
     def _extract_fields(self) -> None:
         """Grab the existing fields from the Collection"""
         from pymilvus import Collection

diff --git a/libs/milvus/tests/integration_tests/utils.py b/libs/milvus/tests/integration_tests/utils.py
@@ -1,5 +1,6 @@
 from typing import List
 
+import numpy as np
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 
@@ -28,6 +29,35 @@ async def aembed_query(self, text: str) -> List[float]:
         return self.embed_query(text)
 
 
+class FakeFp16Embeddings(Embeddings):
+    """Fake fp16 precision embeddings functionality for testing."""
+
+    def embed_documents(self, texts: List[str]) -> List:  # type: ignore[no-untyped-def]
+        """Return simple embeddings with fp16 precision.
+        Embeddings encode each text as its index."""
+        fp16_vectors = []
+        for i in range(len(texts)):
+            raw_vector = [(1 / 9) * d for d in range(9)] + [float(i)]
+            fp16_vector = np.array(raw_vector, dtype=np.float16)
+            fp16_vectors.append(fp16_vector)
+        return fp16_vectors
+
+    async def aembed_documents(self, texts: List[str]) -> List:  # type: ignore[no-untyped-def]
+        return self.embed_documents(texts)
+
+    def embed_query(self, text: str):  # type: ignore[no-untyped-def]
+        """Return constant query embeddings.
+        Embeddings are identical to embed_documents(texts)[0].
+        Distance to each text will be that text's index,
+        as it was passed to embed_documents."""
+        return np.array(
+            [(1 / 9) * d for d in range(9)] + [float(0.0)], dtype=np.float16
+        )
+
+    async def aembed_query(self, text: str):  # type: ignore[no-untyped-def]
+        return self.embed_query(text)
+
+
 def assert_docs_equal_without_pk(
     docs1: List[Document], docs2: List[Document], pk_field: str = "pk"
 ) -> None:

diff --git a/libs/milvus/tests/integration_tests/vectorstores/test_milvus.py b/libs/milvus/tests/integration_tests/vectorstores/test_milvus.py
@@ -10,6 +10,7 @@
 from langchain_milvus.vectorstores import Milvus
 from tests.integration_tests.utils import (
     FakeEmbeddings,
+    FakeFp16Embeddings,
     assert_docs_equal_without_pk,
     fake_texts,
 )
@@ -356,6 +357,7 @@ def test_milvus_sparse_embeddings() -> None:
             texts=texts,
             connection_args={"uri": temp_db.name},
             drop_old=True,
+            consistency_level="Strong",
         )
 
         output = docsearch.similarity_search("Pilgrim", k=1)
@@ -377,10 +379,11 @@ def test_milvus_array_field(temp_milvus_db: Any) -> None:
     docsearch = _milvus_from_texts(
         metadatas=metadatas,
         metadata_schema={
-            "array_field": {
-                "dtype": DataType.ARRAY,
-                "kwargs": {"element_type": DataType.INT64, "max_capacity": 50},
-            },
+            "array_field": dict(
+                dtype=DataType.ARRAY,
+                element_type=DataType.INT64,
+                max_capacity=50,
+            ),
             # "id": {
             #     "dtype": DataType.INT64,
             # }
@@ -409,6 +412,35 @@ def test_milvus_array_field(temp_milvus_db: Any) -> None:
     assert len(output) == 2
 
 
+def test_milvus_vector_field(temp_milvus_db: Any) -> None:
+    # Support custom vector field schema, e.g. supporting Float16 and BFloat
+    # https://milvus.io/docs/release_notes.md#Float16-and-BFloat--Vector-DataType
+    from pymilvus import DataType
+
+    texts = ["foo", "bar", "baz"]
+
+    with tempfile.NamedTemporaryFile(suffix=".db") as temp_db:
+        docsearch = Milvus.from_texts(
+            embedding=FakeFp16Embeddings(),
+            texts=texts,
+            connection_args={"uri": temp_db.name},
+            vector_schema=dict(
+                dtype=DataType.FLOAT16_VECTOR,
+                dim=10,
+                # or kwargs={"dim": 10},
+            ),
+            index_params={
+                "metric_type": "L2",
+                "index_type": "FLAT",  # For milvus lite, only support FLAT for fp16
+            },
+            drop_old=True,
+            consistency_level="Strong",
+        )
+
+        output = docsearch.similarity_search("foo", k=1)
+    assert_docs_equal_without_pk(output, [Document(page_content="foo")])
+
+
 # if __name__ == "__main__":
 #     test_milvus()
 #     test_milvus_vector_search()
@@ -428,3 +460,4 @@ def test_milvus_array_field(temp_milvus_db: Any) -> None:
 #     test_milvus_enable_dynamic_field_with_partition_key()
 #     test_milvus_sparse_embeddings()
 #     test_milvus_array_field()
+#     test_milvus_vector_field()