Skip to content

Commit

Permalink
Further posthog improvements (and a little .gitignore) (#1222)
Browse files Browse the repository at this point in the history
## Description of changes

*Summarize the changes made by this PR.*
 - Improvements & Bug fixes
	 - Increase add() batch size.
	 - Flatten server context so we can do things like group by version.
- Add a few things to `.gitignore` which seem to be created by test
runs.
 - New functionality
	 - Batch query() calls (batch size = 20, where we started for add()).
- Change `collection.get()` so its fields are `int` instead of `bool`
since I imagine we'll eventually batch it as well.

## Test plan
*How are these changes tested?*

- [x] Tests pass locally with `pytest` for python, `yarn test` for js
- [x] Tested locally by printing posthog events:

```python
>>> import chromadb
>>> chroma_client = chromadb.Client()
bf9b885c-b86e-4194-97b4-d9701d293cce ClientStartEvent {'batch_size': 1, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
>>> collection = chroma_client.create_collection(name="my_collection")
bf9b885c-b86e-4194-97b4-d9701d293cce ClientCreateCollectionEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'embedding_function': 'ONNXMiniLM_L6_V2', 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
>>> collection.add(
...     documents=["This is a document", "This is another document"],
...     metadatas=[{"source": "my_source"}, {"source": "my_source"}],
...     ids=["id1", "id2"]
... )
bf9b885c-b86e-4194-97b4-d9701d293cce CollectionAddEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'add_amount': 2, 'with_documents': 2, 'with_metadata': 2, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
>>> for i in range(41):
...   results = collection.query(
...     query_texts=["This is a query document"],
...     n_results=2
...   )
... 
bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 1, 'with_metadata_filter': 1, 'with_document_filter': 1, 'n_results': 2, 'include_metadatas': 1, 'include_documents': 1, 'include_distances': 1, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 20, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 20, 'with_metadata_filter': 20, 'with_document_filter': 20, 'n_results': 40, 'include_metadatas': 20, 'include_documents': 20, 'include_distances': 20, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 20, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 20, 'with_metadata_filter': 20, 'with_document_filter': 20, 'n_results': 40, 'include_metadatas': 20, 'include_documents': 20, 'include_distances': 20, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
>>> for i in range(100):
...   collection.add(documents=[str(i)], ids=[str(i)])
... 
bf9b885c-b86e-4194-97b4-d9701d293cce CollectionAddEvent {'batch_size': 100, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'add_amount': 100, 'with_documents': 100, 'with_metadata': 0, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False}
```

Also confirmed that `collection.get()` spits out an event every time
it's called. It also spits out a bunch of data so I'll elide it from
this PR description.

## Documentation Changes
*Are all docstrings for user-facing APIs updated if required? Do we need
to make documentation changes in the [docs
repository](https://github.com/chroma-core/docs)?*

No docs change needed -- we're not collecting anything new or changing
anything significant about how we collect.
  • Loading branch information
beggers authored Oct 13, 2023
1 parent 85f125a commit 734b133
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 25 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ index_data
# Default configuration for persist_directory in chromadb/config.py
# Currently it's located in "./chroma/"
chroma/
chroma_test_data
server.htpasswd

.venv
venv
.env
.chroma
Expand Down
20 changes: 11 additions & 9 deletions chromadb/api/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,13 +408,14 @@ def _get(
if "documents" in include:
documents = [_doc(m) for m in metadatas]

ids_amount = len(ids) if ids else 0
self._telemetry_client.capture(
CollectionGetEvent(
collection_uuid=str(collection_id),
ids_count=len(ids) if ids else 0,
ids_count=ids_amount,
limit=limit if limit else 0,
include_metadata="metadatas" in include,
include_documents="documents" in include,
include_metadata=ids_amount if "metadatas" in include else 0,
include_documents=ids_amount if "documents" in include else 0,
)
)

Expand Down Expand Up @@ -571,16 +572,17 @@ def _query(
doc_list = [_doc(m) for m in metadata_list]
documents.append(doc_list) # type: ignore

query_amount = len(query_embeddings)
self._telemetry_client.capture(
CollectionQueryEvent(
collection_uuid=str(collection_id),
query_amount=len(query_embeddings),
query_amount=query_amount,
n_results=n_results,
with_metadata_filter=where is not None,
with_document_filter=where_document is not None,
include_metadatas="metadatas" in include,
include_documents="documents" in include,
include_distances="distances" in include,
with_metadata_filter=query_amount if where is not None else 0,
with_document_filter=query_amount if where_document is not None else 0,
include_metadatas=query_amount if "metadatas" in include else 0,
include_documents=query_amount if "documents" in include else 0,
include_distances=query_amount if "distances" in include else 0,
)
)

Expand Down
56 changes: 41 additions & 15 deletions chromadb/telemetry/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ def __init__(self, collection_uuid: str, embedding_function: str):


class CollectionAddEvent(TelemetryEvent):
max_batch_size: ClassVar[int] = 20
max_batch_size: ClassVar[int] = 100
batch_size: int
collection_uuid: str
add_amount: int
with_documents: int
Expand Down Expand Up @@ -89,25 +90,28 @@ def __init__(


class CollectionQueryEvent(TelemetryEvent):
max_batch_size: ClassVar[int] = 20
batch_size: int
collection_uuid: str
query_amount: int
with_metadata_filter: bool
with_document_filter: bool
with_metadata_filter: int
with_document_filter: int
n_results: int
include_metadatas: bool
include_documents: bool
include_distances: bool
include_metadatas: int
include_documents: int
include_distances: int

def __init__(
self,
collection_uuid: str,
query_amount: int,
with_metadata_filter: bool,
with_document_filter: bool,
with_metadata_filter: int,
with_document_filter: int,
n_results: int,
include_metadatas: bool,
include_documents: bool,
include_distances: bool,
include_metadatas: int,
include_documents: int,
include_distances: int,
batch_size: int = 1,
):
super().__init__()
self.collection_uuid = collection_uuid
Expand All @@ -118,22 +122,44 @@ def __init__(
self.include_metadatas = include_metadatas
self.include_documents = include_documents
self.include_distances = include_distances
self.batch_size = batch_size

@property
def batch_key(self) -> str:
return self.collection_uuid + self.name

def batch(self, other: "TelemetryEvent") -> "CollectionQueryEvent":
if not self.batch_key == other.batch_key:
raise ValueError("Cannot batch events")
other = cast(CollectionQueryEvent, other)
total_amount = self.query_amount + other.query_amount
return CollectionQueryEvent(
collection_uuid=self.collection_uuid,
query_amount=total_amount,
with_metadata_filter=self.with_metadata_filter + other.with_metadata_filter,
with_document_filter=self.with_document_filter + other.with_document_filter,
n_results=self.n_results + other.n_results,
include_metadatas=self.include_metadatas + other.include_metadatas,
include_documents=self.include_documents + other.include_documents,
include_distances=self.include_distances + other.include_distances,
batch_size=self.batch_size + other.batch_size,
)


class CollectionGetEvent(TelemetryEvent):
collection_uuid: str
ids_count: int
limit: int
include_metadata: bool
include_documents: bool
include_metadata: int
include_documents: int

def __init__(
self,
collection_uuid: str,
ids_count: int,
limit: int,
include_metadata: bool,
include_documents: bool,
include_metadata: int,
include_documents: int,
):
super().__init__()
self.collection_uuid = collection_uuid
Expand Down
2 changes: 1 addition & 1 deletion chromadb/telemetry/posthog.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _direct_capture(self, event: TelemetryEvent) -> None:
posthog.capture(
self.user_id,
event.name,
{**(event.properties), "chroma_context": self.context},
{**event.properties, **self.context},
)
except Exception as e:
logger.error(f"Failed to send telemetry event {event.name}: {e}")

0 comments on commit 734b133

Please sign in to comment.