From ccfa91470a491ca41e8a2e1bebfb14fc93eaa899 Mon Sep 17 00:00:00 2001 From: Ben Eggers <64657842+beggers@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:51:14 -0800 Subject: [PATCH] More Posthog batching (#1342) ## Description of changes *Summarize the changes made by this PR.* - Improvements & Bug fixes - Batch posthog requests much more aggressively. We really should do batches of ever-increasing size for a given collection but I want to get this fix out this week. - New functionality - ... ## Test plan *How are these changes tested?* - [x] Tests pass locally with `pytest` for python, `yarn test` for js ## Documentation Changes *Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs repository](https://github.com/chroma-core/docs)?* --- chromadb/telemetry/product/events.py | 47 ++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/chromadb/telemetry/product/events.py b/chromadb/telemetry/product/events.py index e5f6bc688c1..2e1617af4b6 100644 --- a/chromadb/telemetry/product/events.py +++ b/chromadb/telemetry/product/events.py @@ -26,7 +26,7 @@ def __init__(self, collection_uuid: str, embedding_function: str): class CollectionAddEvent(ProductTelemetryEvent): - max_batch_size: ClassVar[int] = 100 + max_batch_size: ClassVar[int] = 1000 batch_size: int collection_uuid: str add_amount: int @@ -67,6 +67,8 @@ def batch(self, other: "ProductTelemetryEvent") -> "CollectionAddEvent": class CollectionUpdateEvent(ProductTelemetryEvent): + max_batch_size: ClassVar[int] = 100 + batch_size: int collection_uuid: str update_amount: int with_embeddings: int @@ -80,6 +82,7 @@ def __init__( with_embeddings: int, with_metadata: int, with_documents: int, + batch_size: int = 1, ): super().__init__() self.collection_uuid = collection_uuid @@ -87,10 +90,28 @@ def __init__( self.with_embeddings = with_embeddings self.with_metadata = with_metadata self.with_documents = with_documents + self.batch_size = batch_size + @property + def batch_key(self) -> str: + return self.collection_uuid + self.name + + def batch(self, other: "ProductTelemetryEvent") -> "CollectionUpdateEvent": + if not self.batch_key == other.batch_key: + raise ValueError("Cannot batch events") + other = cast(CollectionUpdateEvent, other) + total_amount = self.update_amount + other.update_amount + return CollectionUpdateEvent( + collection_uuid=self.collection_uuid, + update_amount=total_amount, + with_documents=self.with_documents + other.with_documents, + with_metadata=self.with_metadata + other.with_metadata, + with_embeddings=self.with_embeddings + other.with_embeddings, + batch_size=self.batch_size + other.batch_size, + ) class CollectionQueryEvent(ProductTelemetryEvent): - max_batch_size: ClassVar[int] = 20 + max_batch_size: ClassVar[int] = 1000 batch_size: int collection_uuid: str query_amount: int @@ -147,6 +168,8 @@ def batch(self, other: "ProductTelemetryEvent") -> "CollectionQueryEvent": class CollectionGetEvent(ProductTelemetryEvent): + max_batch_size: ClassVar[int] = 100 + batch_size: int collection_uuid: str ids_count: int limit: int @@ -160,6 +183,7 @@ def __init__( limit: int, include_metadata: int, include_documents: int, + batch_size: int = 1, ): super().__init__() self.collection_uuid = collection_uuid @@ -167,6 +191,25 @@ def __init__( self.limit = limit self.include_metadata = include_metadata self.include_documents = include_documents + self.batch_size = batch_size + + @property + def batch_key(self) -> str: + return self.collection_uuid + self.name + str(self.limit) + + def batch(self, other: "ProductTelemetryEvent") -> "CollectionGetEvent": + if not self.batch_key == other.batch_key: + raise ValueError("Cannot batch events") + other = cast(CollectionGetEvent, other) + total_amount = self.ids_count + other.ids_count + return CollectionGetEvent( + collection_uuid=self.collection_uuid, + ids_count=total_amount, + limit=self.limit, + include_metadata=self.include_metadata + other.include_metadata, + include_documents=self.include_documents + other.include_documents, + batch_size=self.batch_size + other.batch_size, + ) class CollectionDeleteEvent(ProductTelemetryEvent):