bump langchain integration

sdan · Apr 18, 2024 · ee4a6b6 · ee4a6b6
1 parent 8215d5d
commit ee4a6b6
Show file tree

Hide file tree

Showing 3 changed files with 157 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -9,11 +9,12 @@ there is no database you need to set up, no server to run, and no complex config
 ## Features
 
 - 🔥 *Fastest* vector db retrieval with binary embeddings
-- 🔋 Made for RAG -- with embedding generation baked in
+- 🔋 Made for RAG -- with embedding generation (via [mixedbread embed-large](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)) baked in
 - 🍪 CTX (context) file format, a novel abstraction for storing user context similar to browser cookies
 - Ingest text, PDF, CSV, PPTX, and webpages
 - Chunking, metadata filtering, PDF OCR support for extracting text from scanned PDFs
-- **Over 77.95% faster than Chroma on indexing, and 422% faster on retrieval**
+- **>77.95% faster than Chroma on indexing, >422% faster on retrieval, and >3.6x smaller on disk**
+- 🦜 Available in LangChain since vlite v0.2.2
 
 
 ## Installation
@@ -44,6 +45,30 @@ results = vdb.retrieve("how do transformers work?")
 print(results)
 ```
 
+### Usage with LangChain
+```python
+from langchain.document_loaders import TextLoader
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import VLite
+
+# Load the document
+loader = TextLoader("path/to/document.txt")
+documents = loader.load()
+
+# Create a VLite instance
+vlite = VLite(collection="my_collection")
+
+# Add documents to the VLite vector database
+vlite.add_documents(documents)
+
+# Perform a similarity search
+query = "What is the main topic of the document?"
+docs = vlite.similarity_search(query)
+
+# Print the most relevant document
+print(docs[0].page_content)
+```
+
 ## About
 
 vlite is a vector database built for agents, ChatGPT Plugins, and other AI apps that need a fast and simple database to store vectors. It was developed to support the billions of embeddings generated, indexed, and sorted with [ChatWith+ ChatGPT Plugins](https://plugins.sdan.io/), which run for millions of users. Most vector databases either repeatedly crashed on a daily basis or were too expensive for the high throughput required.

diff --git a/tests/langchain.ipynb b/tests/langchain.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Langchain <> VLite integration script\n",
+    "# !pip install langchain==0.1.17\n",
+    "import requests\n",
+    "from langchain.document_loaders import TextLoader\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.vectorstores import VLite\n",
+    "\n",
+    "# Download the PDF\n",
+    "open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
+    "\n",
+    "# Load the PDF document\n",
+    "loader = TextLoader('attention.pdf')\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Split the documents into chunks\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
+    "texts = text_splitter.split_documents(documents)\n",
+    "\n",
+    "# Create a VLite instance\n",
+    "vlite = VLite(collection=\"attention\")\n",
+    "\n",
+    "# Add texts to the VLite vector database\n",
+    "vlite.add_texts([text.page_content for text in texts])\n",
+    "\n",
+    "# Perform a similarity search\n",
+    "query = \"What is attention?\"\n",
+    "docs = vlite.similarity_search(query, k=3)\n",
+    "\n",
+    "# Print the most relevant chunks\n",
+    "for doc in docs:\n",
+    "    print(doc.page_content)\n",
+    "    print('---')\n",
+    "\n",
+    "# Get collection information\n",
+    "vlite.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pure vlite example\n",
+    "import requests\n",
+    "from vlite import VLite\n",
+    "from vlite.utils import process_pdf\n",
+    "\n",
+    "# Start VLite\n",
+    "vdb = VLite('attention2')\n",
+    "\n",
+    "# Download the pdf\n",
+    "open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
+    "\n",
+    "# Process the pdf\n",
+    "corpus = process_pdf('attention.pdf')\n",
+    "\n",
+    "# Add the PDF to the VLite database\n",
+    "vdb.add(corpus)\n",
+    "\n",
+    "# Query the VLite database\n",
+    "print(vdb.retrieve('what is attention'))\n",
+    "\n",
+    "# Print VLite collection information\n",
+    "vdb.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/vlite/main.py b/vlite/main.py
@@ -260,10 +260,10 @@ def clear(self):
         logger.info("[VLite.clear] Collection cleared.")
 
     def info(self):
-        logger.info("[VLite.info] Collection Information:")
-        logger.info(f"[VLite.info] Items: {self.count()}")
-        logger.info(f"[VLite.info] Collection file: {self.collection}")
-        logger.info(f"[VLite.info] Embedding model: {self.model}")
+        print("[VLite.info] Collection Information:")
+        print(f"[VLite.info] Items: {self.count()}")
+        print(f"[VLite.info] Collection file: {self.collection}")
+        print(f"[VLite.info] Embedding model: {self.model}")
 
     def __repr__(self):
         return f"VLite(collection={self.collection}, device={self.device}, model={self.model})"