diff --git a/integrations/pgvector/examples/example.py b/integrations/pgvector/examples/example.py
new file mode 100644
index 000000000..14c2cba60
--- /dev/null
+++ b/integrations/pgvector/examples/example.py
@@ -0,0 +1,66 @@
+# Before running this example, ensure you have PostgreSQL installed with the pgvector extension.
+# For a quick setup using Docker:
+# docker run -d -p 5432:5432 -e POSTGRES_USER=postgres -e POSTGRES_PASSWORD=postgres \
+#     -e POSTGRES_DB=postgres ankane/pgvector
+
+# Install required packages for this example, including pgvector-haystack and other libraries needed
+# for Markdown conversion and embeddings generation. Use the following command:
+# pip install pgvector-haystack markdown-it-py mdit_plain "sentence-transformers>=2.2.0"
+
+# Download some Markdown files to index.
+# git clone https://github.com/anakin87/neural-search-pills
+
+import glob
+
+from haystack import Pipeline
+from haystack.components.converters import MarkdownToDocument
+from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
+from haystack.components.preprocessors import DocumentSplitter
+from haystack.components.writers import DocumentWriter
+from haystack_integrations.components.retrievers.pgvector import PgvectorEmbeddingRetriever
+from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
+
+# Locate the Markdown files before touching the database, so a missing checkout is reported
+# up front instead of silently indexing nothing.
+file_paths = glob.glob("neural-search-pills/pills/*.md")
+if not file_paths:
+    msg = "No Markdown files found in neural-search-pills/pills/: clone the repository first (see above)."
+    raise SystemExit(msg)
+
+# Initialize PgvectorDocumentStore.
+# embedding_dimension has to equal the size of the vectors produced by the embedders below
+# (768 is assumed to fit their default model; change it if you configure another model).
+# recreate_table=True asks the store to recreate "haystack_test" if it already exists.
+document_store = PgvectorDocumentStore(
+    connection_string="postgresql://postgres:postgres@localhost:5432/postgres",
+    table_name="haystack_test",
+    embedding_dimension=768,
+    vector_function="cosine_similarity",
+    recreate_table=True,
+    search_strategy="hnsw",
+)
+
+# Create the indexing Pipeline and index some documents
+indexing = Pipeline()
+indexing.add_component("converter", MarkdownToDocument())
+indexing.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=2))
+indexing.add_component("embedder", SentenceTransformersDocumentEmbedder())
+indexing.add_component("writer", DocumentWriter(document_store))
+indexing.connect("converter", "splitter")
+indexing.connect("splitter", "embedder")
+indexing.connect("embedder", "writer")
+
+indexing.run({"converter": {"sources": file_paths}})
+
+# Create the querying Pipeline and try a query
+querying = Pipeline()
+querying.add_component("embedder", SentenceTransformersTextEmbedder())
+querying.add_component("retriever", PgvectorEmbeddingRetriever(document_store=document_store, top_k=3))
+querying.connect("embedder", "retriever")
+
+results = querying.run({"embedder": {"text": "What is a cross-encoder?"}})
+
+# Print the retrieved documents, separated by a short rule
+for doc in results["retriever"]["documents"]:
+    print(doc)
+    print("-" * 10)
diff --git a/integrations/pgvector/pyproject.toml b/integrations/pgvector/pyproject.toml
index 10ef5d314..65ded967f 100644
--- a/integrations/pgvector/pyproject.toml
+++ b/integrations/pgvector/pyproject.toml
@@ -153,6 +153,8 @@ ban-relative-imports = "parents"
 [tool.ruff.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
 "tests/**/*" = ["PLR2004", "S101", "TID252"]
+# examples can contain "print" commands
+"examples/**/*" = ["T201"]
 
 [tool.coverage.run]
 source_pkgs = ["src", "tests"]