Skip to content

Commit

Permalink
bump langchain integration
Browse files Browse the repository at this point in the history
  • Loading branch information
sdan committed Apr 18, 2024
1 parent 8215d5d commit ee4a6b6
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 6 deletions.
29 changes: 27 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ there is no database you need to set up, no server to run, and no complex config
## Features

- 🔥 *Fastest* vector db retrieval with binary embeddings
- 🔋 Made for RAG -- with embedding generation baked in
- 🔋 Made for RAG -- embedding generation with [mixedbread embed-large](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) is baked in
- 🍪 CTX (context) file format, a novel abstraction for storing user context similar to browser cookies
- Ingest text, PDF, CSV, PPTX, and webpages
- Chunking, metadata filtering, PDF OCR support for extracting text from scanned PDFs
- **Over 77.95% faster than Chroma on indexing, and 422% faster on retrieval**
- **>77.95% faster than Chroma on indexing, >422% faster on retrieval, and >3.6x smaller on disk**
- 🦜 Available in LangChain since vlite v0.2.2


## Installation
Expand Down Expand Up @@ -44,6 +45,30 @@ results = vdb.retrieve("how do transformers work?")
print(results)
```

### Usage with LangChain
```python
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import VLite

# Load the document
loader = TextLoader("path/to/document.txt")
documents = loader.load()

# Split the document into chunks so each chunk is stored and retrieved on its own
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
chunks = text_splitter.split_documents(documents)

# Create a VLite instance
vlite = VLite(collection="my_collection")

# Add the chunks to the VLite vector database
vlite.add_documents(chunks)

# Perform a similarity search
query = "What is the main topic of the document?"
docs = vlite.similarity_search(query)

# Print the most relevant chunk
print(docs[0].page_content)
```

## About

vlite is a vector database built for agents, ChatGPT Plugins, and other AI apps that need a fast and simple database to store vectors. It was developed to support the billions of embeddings generated, indexed, and sorted with [ChatWith+ ChatGPT Plugins](https://plugins.sdan.io/), which run for millions of users. Most vector databases either crashed on a daily basis or were too expensive for the high throughput required.
Expand Down
126 changes: 126 additions & 0 deletions tests/langchain.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Langchain <> VLite integration script\n",
"# !pip install langchain==0.1.17\n",
"import requests\n",
"from langchain.document_loaders import TextLoader\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import VLite\n",
"\n",
"# Download the PDF\n",
"open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
"\n",
"# Load the PDF document\n",
"loader = TextLoader('attention.pdf')\n",
"documents = loader.load()\n",
"\n",
"# Split the documents into chunks\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"# Create a VLite instance\n",
"vlite = VLite(collection=\"attention\")\n",
"\n",
"# Add texts to the VLite vector database\n",
"vlite.add_texts([text.page_content for text in texts])\n",
"\n",
"# Perform a similarity search\n",
"query = \"What is attention?\"\n",
"docs = vlite.similarity_search(query, k=3)\n",
"\n",
"# Print the most relevant chunks\n",
"for doc in docs:\n",
" print(doc.page_content)\n",
" print('---')\n",
"\n",
"# Get collection information\n",
"vlite.info()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Pure vlite example\n",
"import requests\n",
"from vlite import VLite\n",
"from vlite.utils import process_pdf\n",
"\n",
"# Start VLite\n",
"vdb = VLite('attention2')\n",
"\n",
"# Download the pdf\n",
"open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
"\n",
"# Process the pdf\n",
"corpus = process_pdf('attention.pdf')\n",
"\n",
"# Add the PDF to the VLite database\n",
"vdb.add(corpus)\n",
"\n",
"# Query the VLite database\n",
"print(vdb.retrieve('what is attention'))\n",
"\n",
"# Print the VLite database\n",
"vdb.info()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 4 additions & 4 deletions vlite/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,10 @@ def clear(self):
logger.info("[VLite.clear] Collection cleared.")

def info(self):
logger.info("[VLite.info] Collection Information:")
logger.info(f"[VLite.info] Items: {self.count()}")
logger.info(f"[VLite.info] Collection file: {self.collection}")
logger.info(f"[VLite.info] Embedding model: {self.model}")
print("[VLite.info] Collection Information:")
print(f"[VLite.info] Items: {self.count()}")
print(f"[VLite.info] Collection file: {self.collection}")
print(f"[VLite.info] Embedding model: {self.model}")

def __repr__(self):
return f"VLite(collection={self.collection}, device={self.device}, model={self.model})"
Expand Down

0 comments on commit ee4a6b6

Please sign in to comment.