Skip to content

Commit

Permalink
Review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
codetheweb committed Jul 29, 2024
1 parent 5eb6620 commit ad0eb64
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 6 deletions.
2 changes: 1 addition & 1 deletion chromadb/db/mixins/embeddings_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def purge_log(self, collection_id: UUID) -> None:
if results:
min_seq_id = min(self.decode_seq_id(row[0]) for row in results)
else:
min_seq_id = -1
return

t = Table("embeddings_queue")
q = (
Expand Down
3 changes: 2 additions & 1 deletion chromadb/ingest/impl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def trigger_vector_segments_max_seq_id_migration(
"""
SELECT collection
FROM "segments"
WHERE "id" NOT IN (SELECT "segment_id" FROM "max_seq_id")
WHERE "id" NOT IN (SELECT "segment_id" FROM "max_seq_id") AND
"type" = 'urn:chroma:segment/vector/hnsw-local-persisted'
"""
)
collection_ids_with_unmigrated_segments = [row[0] for row in cur.fetchall()]
Expand Down
2 changes: 1 addition & 1 deletion chromadb/segment/impl/vector/local_hnsw.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def _apply_batch(self, batch: Batch) -> None:
self._total_elements_added += batch.add_count

# If that succeeds, finally the seq ID
self._max_seq_id = max(self._max_seq_id, batch.max_seq_id)
self._max_seq_id = batch.max_seq_id

@trace_method("LocalHnswSegment._write_records", OpenTelemetryGranularity.ALL)
def _write_records(self, records: Sequence[LogRecord]) -> None:
Expand Down
13 changes: 10 additions & 3 deletions chromadb/test/property/invariants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gc
import math
from chromadb.api.configuration import HNSWConfigurationInternal
from chromadb.config import System
from chromadb.db.base import get_sql
from chromadb.db.impl.sqlite import SqliteDB
Expand Down Expand Up @@ -341,9 +342,15 @@ def log_size_below_max(
# Must always keep one entry to avoid reusing seq_ids
assert _total_embedding_queue_log_size(sqlite) >= 1

# todo: use new collection config API when available
sync_threshold = collection.metadata.get("hnsw:sync_threshold", 1000)
batch_size = collection.metadata.get("hnsw:batch_size", 100)
hnsw_config = cast(
HNSWConfigurationInternal,
collection.get_model()
.get_configuration()
.get_parameter("hnsw_configuration")
.value,
)
sync_threshold = cast(int, hnsw_config.get_parameter("sync_threshold").value)
batch_size = cast(int, hnsw_config.get_parameter("batch_size").value)

# -1 is used because the queue is always at least 1 entry long, so deletion stops before the max ack'ed sequence ID.
# And if the batch_size != sync_threshold, the queue can have up to batch_size - 1 more entries.
Expand Down
1 change: 1 addition & 0 deletions chromadb/test/property/test_cross_version_persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def test_cycle_versions(
# Should be able to clean log immediately after updating
embeddings_queue = system.instance(SqliteDB)

# 07/29/24: the max_seq_id for vector segments was moved from the pickled metadata file to SQLite.
# Cleaning the log is dependent on vector segments migrating their max_seq_id from the pickled metadata file to SQLite.
# Vector segments migrate this field automatically on init, but at this point the segment has not been loaded yet.
trigger_vector_segments_max_seq_id_migration(
Expand Down

0 comments on commit ad0eb64

Please sign in to comment.