Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update vector collection API - search, optimize, clear, size #679

Merged
merged 11 commits into from
Jun 21, 2024
2 changes: 1 addition & 1 deletion .github/workflows/coverage_runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:

- name: Run tests
env:
HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY }}
HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY_V7 }}
run: python run_tests.py

- name: Publish results to Codecov for PR coming from hazelcast organization
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nightly_runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
pip install -r requirements-test.txt
- name: Run tests
env:
HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY }}
HAZELCAST_ENTERPRISE_KEY: ${{ secrets.HAZELCAST_ENTERPRISE_KEY_V7 }}
run: python run_tests.py
- name: Upload remote controller logs on test failure
uses: actions/upload-artifact@v2
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ Features
- Distributed, CRDT based counter, called **PNCounter**
- Distributed concurrency primitives from CP Subsystem such as
**FencedLock**, **Semaphore**, **AtomicLong**
- Similarity search using **VectorCollection** (Beta)
- Integration with `Hazelcast Cloud <https://cloud.hazelcast.com/>`__
- Support for serverless and traditional web service architectures with
**Unisocket** and **Smart** operation modes
Expand Down
31 changes: 30 additions & 1 deletion examples/vector_collection/vector_collection.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging

from hazelcast import vector
from hazelcast.vector import Type, Vector, Metric, IndexConfig

logging.basicConfig(level=logging.DEBUG)
Expand Down Expand Up @@ -48,12 +47,42 @@ def main():
key = "key-2"
vc.set(key, doc2)

# Optimize collection
vc.optimize()

# Search for a vector
results = vc.search_near_vector(
Vector("default-vector", Type.DENSE, [0.2, 0.3]),
limit=2,
include_value=True,
include_vectors=True,
)
for i, result in enumerate(results):
print(
f"{i+1}.",
"Key:",
result.key,
"Value:",
result.value,
"Score:",
result.score,
"Vector:",
result.vectors,
)

print("size:", vc.size())

# Delete all entries
vc.clear()
print("cleared collection")
print("size:", vc.size())

# Search for a vector
results = vc.search_near_vector(
Vector("default-vector", Type.DENSE, [0.2, 0.3]),
limit=2,
include_value=True,
include_vectors=True,
)
for i, result in enumerate(results):
print(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from hazelcast.serialization.bits import *
from hazelcast.protocol.client_message import END_FRAME_BUF, END_FINAL_FRAME_BUF, SIZE_OF_FRAME_LENGTH_AND_FLAGS, create_initial_buffer_custom
from hazelcast.vector import VectorSearchOptions
from hazelcast.protocol.builtin import ListMultiFrameCodec
from hazelcast.protocol.codec.custom.vector_pair_codec import VectorPairCodec
from hazelcast.protocol.builtin import MapCodec
from hazelcast.protocol.builtin import StringCodec

Expand All @@ -24,7 +22,6 @@ def encode(buf, vector_search_options, is_final=False):
FixSizedTypesCodec.encode_boolean(initial_frame_buf, _INCLUDE_VECTORS_ENCODE_OFFSET, vector_search_options.include_vectors)
FixSizedTypesCodec.encode_int(initial_frame_buf, _LIMIT_ENCODE_OFFSET, vector_search_options.limit)
buf.extend(initial_frame_buf)
ListMultiFrameCodec.encode(buf, vector_search_options.vectors, VectorPairCodec.encode)
MapCodec.encode_nullable(buf, vector_search_options.hints, StringCodec.encode, StringCodec.encode)
if is_final:
buf.extend(END_FINAL_FRAME_BUF)
Expand All @@ -38,7 +35,6 @@ def decode(msg):
include_value = FixSizedTypesCodec.decode_boolean(initial_frame.buf, _INCLUDE_VALUE_DECODE_OFFSET)
include_vectors = FixSizedTypesCodec.decode_boolean(initial_frame.buf, _INCLUDE_VECTORS_DECODE_OFFSET)
limit = FixSizedTypesCodec.decode_int(initial_frame.buf, _LIMIT_DECODE_OFFSET)
vectors = ListMultiFrameCodec.decode(msg, VectorPairCodec.decode)
hints = MapCodec.decode_nullable(msg, StringCodec.decode, StringCodec.decode)
CodecUtil.fast_forward_to_end_frame(msg)
return VectorSearchOptions(include_value, include_vectors, limit, vectors, hints)
return VectorSearchOptions(include_value, include_vectors, limit, hints)
6 changes: 3 additions & 3 deletions hazelcast/protocol/codec/custom/vector_search_result_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def encode(buf, vector_search_result, is_final=False):
FixSizedTypesCodec.encode_float(initial_frame_buf, _SCORE_ENCODE_OFFSET, vector_search_result.score)
buf.extend(initial_frame_buf)
DataCodec.encode(buf, vector_search_result.key)
CodecUtil.encode_nullable(buf, vector_search_result.document, DataCodec.encode)
CodecUtil.encode_nullable(buf, vector_search_result.value, DataCodec.encode)
ListMultiFrameCodec.encode_nullable(buf, vector_search_result.vectors, VectorPairCodec.encode)
if is_final:
buf.extend(END_FINAL_FRAME_BUF)
Expand All @@ -31,7 +31,7 @@ def decode(msg):
initial_frame = msg.next_frame()
score = FixSizedTypesCodec.decode_float(initial_frame.buf, _SCORE_DECODE_OFFSET)
key = DataCodec.decode(msg)
document = CodecUtil.decode_nullable(msg, DataCodec.decode)
value = CodecUtil.decode_nullable(msg, DataCodec.decode)
vectors = ListMultiFrameCodec.decode_nullable(msg, VectorPairCodec.decode)
CodecUtil.fast_forward_to_end_frame(msg)
return VectorSearchResult(key, document, score, vectors)
return VectorSearchResult(key, value, score, vectors)
15 changes: 15 additions & 0 deletions hazelcast/protocol/codec/vector_collection_clear_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
from hazelcast.protocol.builtin import StringCodec

# hex: 0x240A00
_REQUEST_MESSAGE_TYPE = 2361856
# hex: 0x240A01
_RESPONSE_MESSAGE_TYPE = 2361857

_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE


def encode_request(name):
buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
StringCodec.encode(buf, name, True)
return OutboundMessage(buf, False)
17 changes: 17 additions & 0 deletions hazelcast/protocol/codec/vector_collection_optimize_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
from hazelcast.protocol.builtin import StringCodec
from hazelcast.protocol.builtin import CodecUtil

# hex: 0x240900
_REQUEST_MESSAGE_TYPE = 2361600
# hex: 0x240901
_RESPONSE_MESSAGE_TYPE = 2361601

_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE


def encode_request(name, index_name):
buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
StringCodec.encode(buf, name)
CodecUtil.encode_nullable(buf, index_name, StringCodec.encode, True)
return OutboundMessage(buf, True)
4 changes: 3 additions & 1 deletion hazelcast/protocol/codec/vector_collection_put_all_codec.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
from hazelcast.protocol.builtin import StringCodec, EntryListCodec, DataCodec
from hazelcast.protocol.builtin import StringCodec
from hazelcast.protocol.builtin import EntryListCodec
from hazelcast.protocol.builtin import DataCodec
from hazelcast.protocol.codec.custom.vector_document_codec import VectorDocumentCodec

# hex: 0x240300
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer
from hazelcast.protocol.builtin import StringCodec
from hazelcast.protocol.codec.custom.vector_search_options_codec import VectorSearchOptionsCodec
from hazelcast.protocol.builtin import ListMultiFrameCodec
from hazelcast.protocol.codec.custom.vector_pair_codec import VectorPairCodec
from hazelcast.protocol.codec.custom.vector_search_options_codec import VectorSearchOptionsCodec
from hazelcast.protocol.codec.custom.vector_search_result_codec import VectorSearchResultCodec

# hex: 0x240800
Expand All @@ -12,9 +13,10 @@
_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE


def encode_request(name, options):
def encode_request(name, vectors, options):
buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
StringCodec.encode(buf, name)
ListMultiFrameCodec.encode(buf, vectors, VectorPairCodec.encode)
VectorSearchOptionsCodec.encode(buf, options, True)
return OutboundMessage(buf, True)

Expand Down
22 changes: 22 additions & 0 deletions hazelcast/protocol/codec/vector_collection_size_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from hazelcast.protocol.builtin import FixSizedTypesCodec
from hazelcast.protocol.client_message import OutboundMessage, REQUEST_HEADER_SIZE, create_initial_buffer, RESPONSE_HEADER_SIZE
from hazelcast.protocol.builtin import StringCodec

# hex: 0x240B00
_REQUEST_MESSAGE_TYPE = 2362112
# hex: 0x240B01
_RESPONSE_MESSAGE_TYPE = 2362113

_REQUEST_INITIAL_FRAME_SIZE = REQUEST_HEADER_SIZE
_RESPONSE_RESPONSE_OFFSET = RESPONSE_HEADER_SIZE


def encode_request(name):
buf = create_initial_buffer(_REQUEST_INITIAL_FRAME_SIZE, _REQUEST_MESSAGE_TYPE)
StringCodec.encode(buf, name, True)
return OutboundMessage(buf, True)


def decode_response(msg):
initial_frame = msg.next_frame()
return FixSizedTypesCodec.decode_long(initial_frame.buf, _RESPONSE_RESPONSE_OFFSET)
55 changes: 51 additions & 4 deletions hazelcast/proxy/vector_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
vector_collection_put_if_absent_codec,
vector_collection_remove_codec,
vector_collection_put_all_codec,
vector_collection_clear_codec,
vector_collection_optimize_codec,
vector_collection_size_codec,
)
from hazelcast.proxy import Proxy
from hazelcast.serialization.compact import SchemaNotReplicatedError
Expand Down Expand Up @@ -88,14 +91,16 @@ def search_near_vector(
*,
include_value: bool = False,
include_vectors: bool = False,
limit: int = -1
limit: int,
hints: Dict[str, str] = None
) -> Future[List[SearchResult]]:
check_not_none(vector, "vector can't be None")
return self._search_near_vector_internal(
vector,
include_value=include_value,
include_vectors=include_vectors,
limit=limit,
hints=hints,
)

def remove(self, key: Any) -> Future[Optional[Document]]:
Expand All @@ -106,6 +111,35 @@ def delete(self, key: Any) -> Future[None]:
check_not_none(key, "key can't be None")
return self._delete_internal(key)

def optimize(self, index_name: str = None) -> Future[None]:
"""Optimize index by fully removing nodes marked for deletion, trimming neighbor sets
to the advertised degree, and updating the entry node as necessary.

Warning:
This operation can take long time to execute and consume a lot of server resources.

Args:
index_name: Name of the index to optimize. If not specified, the only index defined
for the collection will be used. Must be specified if the collection has more than
one index.
"""
request = vector_collection_optimize_codec.encode_request(self.name, index_name)
return self._invoke(request)

def clear(self) -> Future[None]:
"""Clears the VectorCollection."""
request = vector_collection_clear_codec.encode_request(self.name)
return self._invoke(request)

def size(self) -> Future[int]:
"""Returns the number of Documents in this VectorCollection.

Returns:
Number of Documents in this VectorCollection.
"""
request = vector_collection_size_codec.encode_request(self.name)
return self._invoke(request, vector_collection_size_codec.decode_response)

def _set_internal(self, key: Any, document: Document) -> Future[None]:
try:
key_data = self._to_data(key)
Expand Down Expand Up @@ -142,7 +176,8 @@ def _search_near_vector_internal(
*,
include_value: bool = False,
include_vectors: bool = False,
limit: int = -1
limit: int,
hints: Dict[str, str] = None
yuce marked this conversation as resolved.
Show resolved Hide resolved
) -> Future[List[SearchResult]]:
def handler(message):
results: List[
Expand All @@ -159,13 +194,14 @@ def handler(message):
return results

options = VectorSearchOptions(
vectors=[vector],
include_value=include_value,
include_vectors=include_vectors,
limit=limit,
hints=hints or {},
)
request = vector_collection_search_near_vector_codec.encode_request(
self.name,
[vector],
options,
)
return self._invoke(request, response_handler=handler)
Expand Down Expand Up @@ -249,13 +285,15 @@ def search_near_vector(
*,
include_value: bool = False,
include_vectors: bool = False,
limit: Optional[int] = None
limit: int,
hints: Dict[str, str] = None
) -> List[SearchResult]:
future = self._wrapped.search_near_vector(
vector,
include_value=include_value,
include_vectors=include_vectors,
limit=limit,
hints=hints,
)
return future.result()

Expand All @@ -274,5 +312,14 @@ def put_all(self, map: Dict[Any, Document]) -> None:
def put_if_absent(self, key: Any, document: Document) -> Optional[Document]:
return self._wrapped.put_if_absent(key, document).result()

def clear(self) -> None:
return self._wrapped.clear().result()

def optimize(self, index_name: str = None) -> None:
return self._wrapped.optimize(index_name).result()

def size(self) -> int:
return self._wrapped.size().result()

def destroy(self) -> bool:
return self._wrapped.destroy()
4 changes: 3 additions & 1 deletion hazelcast/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ def vector(self) -> Optional[Vector]:
def __copy__(self):
return Document(self.value, self.vectors)

def __repr__(self):
return f"Document<value={self.value}, vectors={self.vectors}>"


VectorDocument = Document

Expand Down Expand Up @@ -80,7 +83,6 @@ class IndexConfig:
@dataclasses.dataclass
class VectorSearchOptions:

vectors: List[Vector]
include_value: bool
include_vectors: bool
limit: int
Expand Down
8 changes: 0 additions & 8 deletions start_rc.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,14 +77,6 @@ def start_rc(stdout=None, stderr=None):
server = download_if_necessary(
ENTERPRISE_REPO, HAZELCAST_GROUP, "hazelcast-enterprise", SERVER_VERSION
)
vector = download_if_necessary(
ENTERPRISE_REPO, HAZELCAST_GROUP, "hazelcast-enterprise-vector", SERVER_VERSION
)
artifacts.append(vector)
jvector = download_if_necessary(RELEASE_REPO, "io.github.jbellis", "jvector", "2.0.5")
artifacts.append(jvector)
math3 = download_if_necessary(RELEASE_REPO, "org.apache.commons", "commons-math3", "3.6.1")
artifacts.append(math3)
else:
server = download_if_necessary(REPO, HAZELCAST_GROUP, "hazelcast", SERVER_VERSION)

Expand Down
Loading
Loading