From a677c0d97e231095be6a3b53af9e18e1bb983d90 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 29 Nov 2023 21:45:19 +0100 Subject: [PATCH] Patch chroma filters tests (#67) * fix test class * remove deprecated method * make the class not discoverable by pytest * patch it with glue and sticks * lint --- .../src/chroma_haystack/document_store.py | 2 +- .../chroma/src/chroma_haystack/errors.py | 2 +- .../chroma/tests/test_document_store.py | 105 +++++++++++------- integrations/chroma/tests/test_retriever.py | 4 +- 4 files changed, 67 insertions(+), 46 deletions(-) diff --git a/integrations/chroma/src/chroma_haystack/document_store.py b/integrations/chroma/src/chroma_haystack/document_store.py index 16e6f5e9c..b6840b2a7 100644 --- a/integrations/chroma/src/chroma_haystack/document_store.py +++ b/integrations/chroma/src/chroma_haystack/document_store.py @@ -155,7 +155,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D raise ValueError(msg) if doc.content is None: - logger.warn( + logger.warning( "ChromaDocumentStore can only store the text field of Documents: " "'array', 'dataframe' and 'blob' will be dropped." ) diff --git a/integrations/chroma/src/chroma_haystack/errors.py b/integrations/chroma/src/chroma_haystack/errors.py index 474938be4..aeb0230cd 100644 --- a/integrations/chroma/src/chroma_haystack/errors.py +++ b/integrations/chroma/src/chroma_haystack/errors.py @@ -9,7 +9,7 @@ class ChromaDocumentStoreError(DocumentStoreError): pass -class ChromaDocumentStoreFilterError(FilterError): +class ChromaDocumentStoreFilterError(FilterError, ValueError): pass diff --git a/integrations/chroma/tests/test_document_store.py b/integrations/chroma/tests/test_document_store.py index ece91b252..e99a2bbfd 100644 --- a/integrations/chroma/tests/test_document_store.py +++ b/integrations/chroma/tests/test_document_store.py @@ -9,12 +9,16 @@ import pytest from chromadb.api.types import Documents, EmbeddingFunction, Embeddings from haystack import Document -from haystack.testing.document_store import DocumentStoreBaseTests +from haystack.testing.document_store import ( + CountDocumentsTest, + DeleteDocumentsTest, + LegacyFilterDocumentsTest, +) from chroma_haystack.document_store import ChromaDocumentStore -class TestEmbeddingFunction(EmbeddingFunction): +class _TestEmbeddingFunction(EmbeddingFunction): """ Chroma lets you provide custom functions to compute embeddings, we use this feature to provide a fake algorithm returning random @@ -26,49 +30,64 @@ def __call__(self, input: Documents) -> Embeddings: # noqa - chroma will inspec return [np.random.default_rng().uniform(-1, 1, 768).tolist()] -class TestDocumentStore(DocumentStoreBaseTests): +class TestDocumentStore(CountDocumentsTest, DeleteDocumentsTest, LegacyFilterDocumentsTest): """ Common test cases will be provided by `DocumentStoreBaseTests` but you can add more to this class. """ @pytest.fixture - def docstore(self) -> ChromaDocumentStore: + def document_store(self) -> ChromaDocumentStore: """ This is the most basic requirement for the child class: provide an instance of this document store so the base class can use it. """ with mock.patch("chroma_haystack.document_store.get_embedding_function") as get_func: - get_func.return_value = TestEmbeddingFunction() + get_func.return_value = _TestEmbeddingFunction() return ChromaDocumentStore(embedding_function="test_function", collection_name=str(uuid.uuid1())) + def assert_documents_are_equal(self, received: List[Document], expected: List[Document]): + """ + Assert that two lists of Documents are equal. + This is used in every test, if a Document Store implementation has a different behaviour + it should override this method. + + This can happen for example when the Document Store sets a score to returned Documents. + Since we can't know what the score will be, we can't compare the Documents reliably. + """ + for doc_received, doc_expected in zip(received, expected): + assert doc_received.content == doc_expected.content + assert doc_received.meta == doc_expected.meta + @pytest.mark.unit - def test_ne_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): """ We customize this test because Chroma consider "not equal" true when a field is missing """ - docstore.write_documents(filterable_docs) - result = docstore.filter_documents(filters={"page": {"$ne": "100"}}) - assert self.contains_same_docs(result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"]) + document_store.write_documents(filterable_docs) + result = document_store.filter_documents(filters={"page": {"$ne": "100"}}) + self.assert_documents_are_equal( + result, [doc for doc in filterable_docs if doc.meta.get("page", "100") != "100"] + ) @pytest.mark.unit - def test_delete_empty(self, docstore: ChromaDocumentStore): + def test_delete_empty(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ - docstore.delete_documents(["test"]) + document_store.delete_documents(["test"]) @pytest.mark.unit - def test_delete_not_empty_nonexisting(self, docstore: ChromaDocumentStore): + def test_delete_not_empty_nonexisting(self, document_store: ChromaDocumentStore): """ Deleting a non-existing document should not raise with Chroma """ doc = Document(content="test doc") - docstore.write_documents([doc]) - docstore.delete_documents(["non_existing"]) + document_store.write_documents([doc]) + document_store.delete_documents(["non_existing"]) - assert docstore.filter_documents(filters={"id": doc.id}) == [doc] + assert document_store.filter_documents(filters={"id": doc.id}) == [doc] @pytest.mark.integration def test_to_json(self, request): @@ -95,141 +114,143 @@ def test_from_json(self): @pytest.mark.skip(reason="Filter on array contents is not supported.") @pytest.mark.unit - def test_filter_document_array(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_array(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on dataframe contents is not supported.") @pytest.mark.unit - def test_filter_document_dataframe(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_document_dataframe(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_eq_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_eq_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_eq_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported. Filter on table contents is not supported.") @pytest.mark.unit - def test_in_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$in operator is not supported.") @pytest.mark.unit - def test_in_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_in_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_ne_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on embedding value is not supported.") @pytest.mark.unit - def test_ne_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_ne_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on table contents is not supported.") @pytest.mark.unit - def test_nin_filter_table(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_table(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported. Filter on embedding value is not supported.") @pytest.mark.unit - def test_nin_filter_embedding(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter_embedding(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="$nin operator is not supported.") @pytest.mark.unit - def test_nin_filter(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_nin_filter(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_implicit_and_with_multi_key_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_simple_explicit_and_with_multikey_dict( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_explicit_and_with_list(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_explicit_and_with_list( + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] + ): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_explicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_explicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_implicit_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_implicit_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_simple_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_simple_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter on table contents is not supported.") @pytest.mark.unit - def test_filter_nested_and_or_explicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_explicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_and_or_implicit(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_and_or_implicit(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit - def test_filter_nested_or_and(self, docstore: ChromaDocumentStore, filterable_docs: List[Document]): + def test_filter_nested_or_and(self, document_store: ChromaDocumentStore, filterable_docs: List[Document]): pass @pytest.mark.skip(reason="Filter syntax not supported.") @pytest.mark.unit def test_filter_nested_multiple_identical_operators_same_level( - self, docstore: ChromaDocumentStore, filterable_docs: List[Document] + self, document_store: ChromaDocumentStore, filterable_docs: List[Document] ): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_fail(self, docstore: ChromaDocumentStore): + def test_write_duplicate_fail(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_skip(self, docstore: ChromaDocumentStore): + def test_write_duplicate_skip(self, document_store: ChromaDocumentStore): pass @pytest.mark.skip(reason="Duplicate policy not supported.") @pytest.mark.unit - def test_write_duplicate_overwrite(self, docstore: ChromaDocumentStore): + def test_write_duplicate_overwrite(self, document_store: ChromaDocumentStore): pass diff --git a/integrations/chroma/tests/test_retriever.py b/integrations/chroma/tests/test_retriever.py index d1bbe5c49..b77dd4ca4 100644 --- a/integrations/chroma/tests/test_retriever.py +++ b/integrations/chroma/tests/test_retriever.py @@ -11,7 +11,7 @@ def test_retriever_to_json(request): ) retriever = ChromaQueryRetriever(ds, filters={"foo": "bar"}, top_k=99) assert retriever.to_dict() == { - "type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"foo": "bar"}, "top_k": 99, @@ -27,7 +27,7 @@ def test_retriever_to_json(request): @pytest.mark.integration def test_retriever_from_json(request): data = { - "type": "ChromaQueryRetriever", + "type": "chroma_haystack.retriever.ChromaQueryRetriever", "init_parameters": { "filters": {"bar": "baz"}, "top_k": 42,