Skip to content

Commit

Permalink
rm onnxruntime, no longer used in 2+
Browse files Browse the repository at this point in the history
  • Loading branch information
sdan committed Apr 25, 2024
1 parent ddbf16b commit 9ce4907
Show file tree
Hide file tree
Showing 7 changed files with 755 additions and 3 deletions.
14 changes: 14 additions & 0 deletions docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ vlite uses the CTX (Context) file format for efficient storage and retrieval of

The CTX file format is designed to be memory-efficient and allows for fast loading and saving of embeddings and associated data.

| Section | Byte Size | Example |
|---------------|-------------|----------------------------------------------------------------------------------------------|
| Magic Number | 4 bytes | `b"CTXF"` |
| Version       | 4 bytes     | `1`, serialized as a little-endian 32-bit integer: `b'\x01\x00\x00\x00'`                      |
| Header | Variable | JSON string of the header dict, encoded in UTF-8 and prefixed with its length in bytes. |
| | | Example JSON: `{"embedding_model": "default", "embedding_size": 64, "embedding_dtype": "float32", "context_length": 512}` |
| Embeddings | Variable | Each embedding is 64 dimensions of `float32`. 64 floats * 4 bytes each = 256 bytes. |
| | | An embedding example might look like a sequence of 256 bytes after packing. |
| Contexts      | Variable    | Each context string is prefixed with its length in bytes, followed by the string encoded in UTF-8. |
| | | Example: A context string with its length prefix. |
| Metadata | Variable | JSON string of the metadata dict, encoded in UTF-8 and prefixed with its length in bytes. |
| | | Example JSON: `{"created_at": "2024-04-18", ...}` |


### Creating a CTX File
To create a new CTX file, use the `create` method of the `Ctx` class:
```python
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ Requests
beautifulsoup4
huggingface_hub
tiktoken
onnxruntime==1.17.1
tokenizers==0.15.2
tokenizers==0.15.2
Binary file added tests/attention.pdf
Binary file not shown.
Binary file added tests/chromadb/chroma.sqlite3
Binary file not shown.
468 changes: 468 additions & 0 deletions tests/notebook3.ipynb

Large diffs are not rendered by default.

56 changes: 55 additions & 1 deletion tests/notebook4.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,61 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"!pip install langchain-chroma\n",
"!pip install pypdf\n",
"!pip install sentence-transformers\n",
"import requests\n",
"\n",
"from vlite.utils import process_pdf\n",
"\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.document_loaders import PyPDFLoader\n",
"\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"\n",
"from langchain_community.embeddings.sentence_transformer import (\n",
" SentenceTransformerEmbeddings,\n",
")\n",
"\n",
"from langchain_chroma import Chroma\n",
"\n",
"import chromadb\n",
"\n",
"# Download the PDF\n",
"open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
"\n",
"# Load the PDF document\n",
"loader = PyPDFLoader(\"attention.pdf\")\n",
"pages = loader.load_and_split()\n",
"\n",
"# Split the documents into chunks\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
"texts = text_splitter.split_documents(pages)\n",
"\n",
"# Initialize embeddings\n",
"embeddings = SentenceTransformerEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")\n",
"\n",
"# Initialize Chroma\n",
"chroma_client = chromadb.PersistentClient(path=\"chromadb\")\n",
"collection = chroma_client.create_collection(name=\"attention\")\n",
"\n",
"# Add texts to the Chroma vector database\n",
"chroma = Chroma.from_documents(texts, embeddings, collection_name=\"attention\")\n",
"\n",
"# Perform a similarity search\n",
"query = \"What is attention?\"\n",
"docs = chroma.similarity_search(query, k=3)\n",
"\n",
"# Print the most relevant chunks\n",
"for doc in docs:\n",
" print(doc.page_content)\n",
" print('---')\n",
"\n",
"# Get collection information\n",
"collection = chroma_client.get_collection(name=\"attention\")\n",
"print(f\"Number of elements in the collection: {collection.count()}\")"
]
}
],
"metadata": {
Expand Down
217 changes: 217 additions & 0 deletions tests/querytest.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sdan/miniforge3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Not using OCR for data/attention2.pdf\n",
"Initial length of the texts: 4\n",
"Initial length of the embeddings: 4\n",
"Extended length of the texts: 500000\n",
"Extended length of the embeddings: 500000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of items in the VLite instance: 500000\n",
"[VLite.info] Collection Information:\n",
"[VLite.info] Items: 500000\n",
"[VLite.info] Collection file: vlite_20240417_211628\n",
"[VLite.info] Embedding model: <vlite.model.EmbeddingModel object at 0x104301480>\n",
"Execution time: 1.197732925415039 seconds\n"
]
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"\n",
"from vlite import VLite\n",
"from vlite import EmbeddingModel\n",
"from vlite.utils import process_file, process_pdf, process_webpage\n",
"import time\n",
"\n",
"# Load and process data\n",
"corpus = process_file(\"data/attention2.pdf\")\n",
"emd = EmbeddingModel()\n",
"embeddings = emd.embed(corpus)\n",
"\n",
"# Verify the initial lengths\n",
"print(f\"Initial length of the texts: {len(corpus)}\")\n",
"print(f\"Initial length of the embeddings: {len(embeddings)}\")\n",
"\n",
"# Multiply texts and embeddings\n",
"multiplier = 125000\n",
"extended_corpus = corpus * multiplier\n",
"# Correctly extend the list of embeddings\n",
"extended_embeddings = [emb for emb in embeddings for _ in range(multiplier)]\n",
"\n",
"# Verify the extended lengths\n",
"print(f\"Extended length of the texts: {len(extended_corpus)}\")\n",
"print(f\"Extended length of the embeddings: {len(extended_embeddings)}\")\n",
"\n",
"# Create VLite instance and use set_batch\n",
"vdb = VLite()\n",
"vdb.set_batch(texts=extended_corpus, embeddings=extended_embeddings)\n",
"\n",
"# Check and display results\n",
"print(f\"Number of items in the VLite instance: {vdb.count()}\")\n",
"vdb.info()\n",
"\n",
"start_time = time.time()\n",
"\n",
"vdb.retrieve(\"attention\", top_k=1)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"\n",
"print(f\"Execution time: {execution_time} seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Not using OCR for data/attention2.pdf\n",
"Initial length of the texts: 4\n",
"Initial length of the embeddings: 4\n",
"Extended length of the texts: 1000000\n",
"Extended length of the embeddings: 1000000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of items in the VLite instance: 1000000\n",
"[VLite.info] Collection Information:\n",
"[VLite.info] Items: 1000000\n",
"[VLite.info] Collection file: vlite_20240417_211918\n",
"[VLite.info] Embedding model: <vlite.model.EmbeddingModel object at 0x1043021a0>\n",
"Execution time: 2.6589159965515137 seconds\n"
]
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"\n",
"from vlite import VLite\n",
"from vlite import EmbeddingModel\n",
"from vlite.utils import process_file, process_pdf, process_webpage\n",
"import time\n",
"\n",
"# Load and process data\n",
"corpus = process_file(\"data/attention2.pdf\")\n",
"emd = EmbeddingModel()\n",
"embeddings = emd.embed(corpus)\n",
"\n",
"# Verify the initial lengths\n",
"print(f\"Initial length of the texts: {len(corpus)}\")\n",
"print(f\"Initial length of the embeddings: {len(embeddings)}\")\n",
"\n",
"# Multiply texts and embeddings\n",
"multiplier = 125000*2\n",
"extended_corpus = corpus * multiplier\n",
"# Correctly extend the list of embeddings\n",
"extended_embeddings = [emb for emb in embeddings for _ in range(multiplier)]\n",
"\n",
"# Verify the extended lengths\n",
"print(f\"Extended length of the texts: {len(extended_corpus)}\")\n",
"print(f\"Extended length of the embeddings: {len(extended_embeddings)}\")\n",
"\n",
"# Create VLite instance and use set_batch\n",
"vdb = VLite()\n",
"vdb.set_batch(texts=extended_corpus, embeddings=extended_embeddings)\n",
"\n",
"# Check and display results\n",
"print(f\"Number of items in the VLite instance: {vdb.count()}\")\n",
"vdb.info()\n",
"\n",
"start_time = time.time()\n",
"\n",
"vdb.retrieve(\"attention\", top_k=1)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"\n",
"print(f\"Execution time: {execution_time} seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 9ce4907

Please sign in to comment.