hazelcast · yuce · Jun 21, 2024 · Jun 14, 2024 · Jun 14, 2024 · Jun 14, 2024
diff --git a/.github/workflows/coverage_runner.yml b/.github/workflows/coverage_runner.yml
@@ -75,7 +75,7 @@ jobs:
 
       - name: Run tests
         env:
-          HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY }}
+          HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY_V7 }}
         run: python run_tests.py
 
       - name: Publish results to Codecov for PR coming from hazelcast organization

diff --git a/.github/workflows/nightly_runner.yml b/.github/workflows/nightly_runner.yml
@@ -32,7 +32,7 @@ jobs:
           pip install -r requirements-test.txt
       - name: Run tests
         env:
-          HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY }}
+          HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY_V7 }}
         run: python run_tests.py
       - name: Upload remote controller logs on test failure
         uses: actions/upload-artifact@v2

diff --git a/README.rst b/README.rst
@@ -143,6 +143,7 @@ Features
 -  Distributed, CRDT based counter, called **PNCounter**
 -  Distributed concurrency primitives from CP Subsystem such as
    **FencedLock**, **Semaphore**, **AtomicLong**
+-  Similarity search using **VectorCollection** (Beta)
 -  Integration with `Hazelcast Cloud <https://cloud.hazelcast.com/>`__
 -  Support for serverless and traditional web service architectures with
    **Unisocket** and **Smart** operation modes

diff --git a/examples/vector_collection/vector_collection.py b/examples/vector_collection/vector_collection.py
@@ -1,6 +1,5 @@
 import logging
 
-from hazelcast import vector
 from hazelcast.vector import Type, Vector, Metric, IndexConfig
 
 logging.basicConfig(level=logging.DEBUG)
@@ -48,12 +47,42 @@ def main():
     key = "key-2"
     vc.set(key, doc2)
 
+    # Optimize collection
+    vc.optimize()
+
     # Search for a vector
     results = vc.search_near_vector(
         Vector("default-vector", Type.DENSE, [0.2, 0.3]),
+        limit=2,
         include_value=True,
         include_vectors=True,
+    )
+    for i, result in enumerate(results):
+        print(
+            f"{i+1}.",
+            "Key:",
+            result.key,
+            "Value:",
+            result.value,
+            "Score:",
+            result.score,
+            "Vector:",
+            result.vectors,
+        )
+
+    print("size:", vc.size())
+
+    # Delete all entries
+    vc.clear()
+    print("cleared collection")
+    print("size:", vc.size())
+
+    # Search for a vector
+    results = vc.search_near_vector(
+        Vector("default-vector", Type.DENSE, [0.2, 0.3]),
         limit=2,
+        include_value=True,
+        include_vectors=True,
     )
     for i, result in enumerate(results):
         print(

diff --git a/hazelcast/protocol/codec/custom/vector_search_options_codec.py b/hazelcast/protocol/codec/custom/vector_search_options_codec.py
@@ -2,8 +2,6 @@
 from hazelcast.serialization.bits import *
 from hazelcast.protocol.client_message import END_FRAME_BUF, END_FINAL_FRAME_BUF, SIZE_OF_FRAME_LENGTH_AND_FLAGS, create_initial_buffer_custom
 from hazelcast.vector import VectorSearchOptions
-from hazelcast.protocol.builtin import ListMultiFrameCodec
-from hazelcast.protocol.codec.custom.vector_pair_codec import VectorPairCodec
 from hazelcast.protocol.builtin import MapCodec
 from hazelcast.protocol.builtin import StringCodec
 
@@ -24,7 +22,6 @@ def encode(buf, vector_search_options, is_final=False):
         FixSizedTypesCodec.encode_boolean(initial_frame_buf, _INCLUDE_VECTORS_ENCODE_OFFSET, vector_search_options.include_vectors)
         FixSizedTypesCodec.encode_int(initial_frame_buf, _LIMIT_ENCODE_OFFSET, vector_search_options.limit)
         buf.extend(initial_frame_buf)
-        ListMultiFrameCodec.encode(buf, vector_search_options.vectors, VectorPairCodec.encode)
         MapCodec.encode_nullable(buf, vector_search_options.hints, StringCodec.encode, StringCodec.encode)
         if is_final:
             buf.extend(END_FINAL_FRAME_BUF)
@@ -38,7 +35,6 @@ def decode(msg):
         include_value = FixSizedTypesCodec.decode_boolean(initial_frame.buf, _INCLUDE_VALUE_DECODE_OFFSET)
         include_vectors = FixSizedTypesCodec.decode_boolean(initial_frame.buf, _INCLUDE_VECTORS_DECODE_OFFSET)
         limit = FixSizedTypesCodec.decode_int(initial_frame.buf, _LIMIT_DECODE_OFFSET)
-        vectors = ListMultiFrameCodec.decode(msg, VectorPairCodec.decode)
         hints = MapCodec.decode_nullable(msg, StringCodec.decode, StringCodec.decode)
         CodecUtil.fast_forward_to_end_frame(msg)
-        return VectorSearchOptions(include_value, include_vectors, limit, vectors, hints)
+        return VectorSearchOptions(include_value, include_vectors, limit, hints)
diff --git a/hazelcast/protocol/codec/custom/vector_search_result_codec.py b/hazelcast/protocol/codec/custom/vector_search_result_codec.py
@@ -18,7 +18,7 @@ def encode(buf, vector_search_result, is_final=False):
         FixSizedTypesCodec.encode_float(initial_frame_buf, _SCORE_ENCODE_OFFSET, vector_search_result.score)
         buf.extend(initial_frame_buf)
         DataCodec.encode(buf, vector_search_result.key)
-        CodecUtil.encode_nullable(buf, vector_search_result.document, DataCodec.encode)
+        CodecUtil.encode_nullable(buf, vector_search_result.value, DataCodec.encode)
         ListMultiFrameCodec.encode_nullable(buf, vector_search_result.vectors, VectorPairCodec.encode)
         if is_final:
             buf.extend(END_FINAL_FRAME_BUF)
@@ -31,7 +31,7 @@ def decode(msg):
         initial_frame = msg.next_frame()
         score = FixSizedTypesCodec.decode_float(initial_frame.buf, _SCORE_DECODE_OFFSET)
         key = DataCodec.decode(msg)
-        document = CodecUtil.decode_nullable(msg, DataCodec.decode)
+        value = CodecUtil.decode_nullable(msg, DataCodec.decode)
         vectors = ListMultiFrameCodec.decode_nullable(msg, VectorPairCodec.decode)
         CodecUtil.fast_forward_to_end_frame(msg)
-        return VectorSearchResult(key, document, score, vectors)
+        return VectorSearchResult(key, value, score, vectors)
diff --git a/hazelcast/protocol/codec/vector_collection_clear_codec.py b/hazelcast/protocol/codec/vector_collection_clear_codec.py
@@ -0,0 +1,15 @@
+from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
+from hazelcast.protocol.builtin import StringCodec
+
+# hex: 0x240A00
+_REQUEST_MESSAGE_TYPE = 2361856
+# hex: 0x240A01
+_RESPONSE_MESSAGE_TYPE = 2361857
+
+_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE
+
+
+def encode_request(name):
+    buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
+    StringCodec.encode(buf, name, True)
+    return OutboundMessage(buf, False)
diff --git a/hazelcast/protocol/codec/vector_collection_optimize_codec.py b/hazelcast/protocol/codec/vector_collection_optimize_codec.py
@@ -0,0 +1,17 @@
+from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
+from hazelcast.protocol.builtin import StringCodec
+from hazelcast.protocol.builtin import CodecUtil
+
+# hex: 0x240900
+_REQUEST_MESSAGE_TYPE = 2361600
+# hex: 0x240901
+_RESPONSE_MESSAGE_TYPE = 2361601
+
+_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE
+
+
+def encode_request(name, index_name):
+    buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
+    StringCodec.encode(buf, name)
+    CodecUtil.encode_nullable(buf, index_name, StringCodec.encode, True)
+    return OutboundMessage(buf, True)
diff --git a/hazelcast/protocol/codec/vector_collection_put_all_codec.py b/hazelcast/protocol/codec/vector_collection_put_all_codec.py
@@ -1,5 +1,7 @@
 from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
-from hazelcast.protocol.builtin import StringCodec, EntryListCodec, DataCodec
+from hazelcast.protocol.builtin import StringCodec
+from hazelcast.protocol.builtin import EntryListCodec
+from hazelcast.protocol.builtin import DataCodec
 from hazelcast.protocol.codec.custom.vector_document_codec import VectorDocumentCodec
 
 # hex: 0x240300

diff --git a/hazelcast/protocol/codec/vector_collection_search_near_vector_codec.py b/hazelcast/protocol/codec/vector_collection_search_near_vector_codec.py
@@ -1,7 +1,8 @@
 from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
 from hazelcast.protocol.builtin import StringCodec
-from hazelcast.protocol.codec.custom.vector_search_options_codec import VectorSearchOptionsCodec
 from hazelcast.protocol.builtin import ListMultiFrameCodec
+from hazelcast.protocol.codec.custom.vector_pair_codec import VectorPairCodec
+from hazelcast.protocol.codec.custom.vector_search_options_codec import VectorSearchOptionsCodec
 from hazelcast.protocol.codec.custom.vector_search_result_codec import VectorSearchResultCodec
 
 # hex: 0x240800
@@ -12,9 +13,10 @@
 _REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE
 
 
-def encode_request(name, options):
+def encode_request(name, vectors, options):
     buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
     StringCodec.encode(buf, name)
+    ListMultiFrameCodec.encode(buf, vectors, VectorPairCodec.encode)
     VectorSearchOptionsCodec.encode(buf, options, True)
     return OutboundMessage(buf, True)
 

diff --git a/hazelcast/protocol/codec/vector_collection_size_codec.py b/hazelcast/protocol/codec/vector_collection_size_codec.py
@@ -0,0 +1,22 @@
+from hazelcast.protocol.builtin import FixSizedTypesCodec
+from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer, RESPONSE_HEADER_SIZE
+from hazelcast.protocol.builtin import StringCodec
+
+# hex: 0x240B00
+_REQUEST_MESSAGE_TYPE = 2362112
+# hex: 0x240B01
+_RESPONSE_MESSAGE_TYPE = 2362113
+
+_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE
+_RESPONSE_RESPONSE_OFFSET = RESPONSE_HEADER_SIZE
+
+
+def encode_request(name):
+    buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
+    StringCodec.encode(buf, name, True)
+    return OutboundMessage(buf, True)
+
+
+def decode_response(msg):
+    initial_frame = msg.next_frame()
+    return FixSizedTypesCodec.decode_long(initial_frame.buf, _RESPONSE_RESPONSE_OFFSET)
diff --git a/hazelcast/proxy/vector_collection.py b/hazelcast/proxy/vector_collection.py
@@ -11,6 +11,9 @@
     vector_collection_put_if_absent_codec,
     vector_collection_remove_codec,
     vector_collection_put_all_codec,
+    vector_collection_clear_codec,
+    vector_collection_optimize_codec,
+    vector_collection_size_codec,
 )
 from hazelcast.proxy import Proxy
 from hazelcast.serialization.compact import SchemaNotReplicatedError
@@ -88,14 +91,16 @@ def search_near_vector(
         *,
         include_value: bool = False,
         include_vectors: bool = False,
-        limit: int = -1
+        limit: int,
+        hints: Dict[str, str] = None
     ) -> Future[List[SearchResult]]:
         check_not_none(vector, "vector can't be None")
         return self._search_near_vector_internal(
             vector,
             include_value=include_value,
             include_vectors=include_vectors,
             limit=limit,
+            hints=hints,
         )
 
     def remove(self, key: Any) -> Future[Optional[Document]]:
@@ -106,6 +111,35 @@ def delete(self, key: Any) -> Future[None]:
         check_not_none(key, "key can't be None")
         return self._delete_internal(key)
 
+    def optimize(self, index_name: str = None) -> Future[None]:
+        """Optimize index by fully removing nodes marked for deletion, trimming neighbor sets
+        to the advertised degree, and updating the entry node as necessary.
+
+        Warning:
+            This operation can take long time to execute and consume a lot of server resources.
+
+        Args:
+            index_name: Name of the index to optimize. If not specified, the only index defined
+                for the collection will be used. Must be specified if the collection has more than
+                one index.
+        """
+        request = vector_collection_optimize_codec.encode_request(self.name, index_name)
+        return self._invoke(request)
+
+    def clear(self) -> Future[None]:
+        """Clears the VectorCollection."""
+        request = vector_collection_clear_codec.encode_request(self.name)
+        return self._invoke(request)
+
+    def size(self) -> Future[int]:
+        """Returns the number of Documents in this VectorCollection.
+
+        Returns:
+            Number of Documents in this VectorCollection.
+        """
+        request = vector_collection_size_codec.encode_request(self.name)
+        return self._invoke(request, vector_collection_size_codec.decode_response)
+
     def _set_internal(self, key: Any, document: Document) -> Future[None]:
         try:
             key_data = self._to_data(key)
@@ -142,7 +176,8 @@ def _search_near_vector_internal(
         *,
         include_value: bool = False,
         include_vectors: bool = False,
-        limit: int = -1
+        limit: int,
+        hints: Dict[str, str] = None
     ) -> Future[List[SearchResult]]:
         def handler(message):
             results: List[
@@ -159,13 +194,14 @@ def handler(message):
             return results
 
         options = VectorSearchOptions(
-            vectors=[vector],
             include_value=include_value,
             include_vectors=include_vectors,
             limit=limit,
+            hints=hints or {},
         )
         request = vector_collection_search_near_vector_codec.encode_request(
             self.name,
+            [vector],
             options,
         )
         return self._invoke(request, response_handler=handler)
@@ -249,13 +285,15 @@ def search_near_vector(
         *,
         include_value: bool = False,
         include_vectors: bool = False,
-        limit: Optional[int] = None
+        limit: int,
+        hints: Dict[str, str] = None
     ) -> List[SearchResult]:
         future = self._wrapped.search_near_vector(
             vector,
             include_value=include_value,
             include_vectors=include_vectors,
             limit=limit,
+            hints=hints,
         )
         return future.result()
 
@@ -274,5 +312,14 @@ def put_all(self, map: Dict[Any, Document]) -> None:
     def put_if_absent(self, key: Any, document: Document) -> Optional[Document]:
         return self._wrapped.put_if_absent(key, document).result()
 
+    def clear(self) -> None:
+        return self._wrapped.clear().result()
+
+    def optimize(self, index_name: str = None) -> None:
+        return self._wrapped.optimize(index_name).result()
+
+    def size(self) -> int:
+        return self._wrapped.size().result()
+
     def destroy(self) -> bool:
         return self._wrapped.destroy()
diff --git a/hazelcast/vector.py b/hazelcast/vector.py
@@ -40,6 +40,9 @@ def vector(self) -> Optional[Vector]:
     def __copy__(self):
         return Document(self.value, self.vectors)
 
+    def __repr__(self):
+        return f"Document<value={self.value}, vectors={self.vectors}>"
+
 
 VectorDocument = Document
 
@@ -80,7 +83,6 @@ class IndexConfig:
 @dataclasses.dataclass
 class VectorSearchOptions:
 
-    vectors: List[Vector]
     include_value: bool
     include_vectors: bool
     limit: int

diff --git a/start_rc.py b/start_rc.py
@@ -77,14 +77,6 @@ def start_rc(stdout=None, stderr=None):
         server = download_if_necessary(
             ENTERPRISE_REPO, HAZELCAST_GROUP, "hazelcast-enterprise", SERVER_VERSION
         )
-        vector = download_if_necessary(
-            ENTERPRISE_REPO, HAZELCAST_GROUP, "hazelcast-enterprise-vector", SERVER_VERSION
-        )
-        artifacts.append(vector)
-        jvector = download_if_necessary(RELEASE_REPO, "io.github.jbellis", "jvector", "2.0.5")
-        artifacts.append(jvector)
-        math3 = download_if_necessary(RELEASE_REPO, "org.apache.commons", "commons-math3", "3.6.1")
-        artifacts.append(math3)
     else:
         server = download_if_necessary(REPO, HAZELCAST_GROUP, "hazelcast", SERVER_VERSION)