Skip to content

Commit

Permalink
rm onnxruntime, no longer used in 2+
Browse files Browse the repository at this point in the history
  • Loading branch information
sdan committed Apr 25, 2024
1 parent ddbf16b commit 9ce4907
Show file tree
Hide file tree
Showing 7 changed files with 755 additions and 3 deletions.
14 changes: 14 additions & 0 deletions docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ vlite uses the CTX (Context) file format for efficient storage and retrieval of

The CTX file format is designed to be memory-efficient and allows for fast loading and saving of embeddings and associated data.

| Section | Byte Size | Example |
|---------------|-------------|----------------------------------------------------------------------------------------------|
| Magic Number | 4 bytes | `b"CTXF"` |
| Version       | 4 bytes     | `1`, serialized as a little-endian 32-bit integer: `b'\x01\x00\x00\x00'`                      |
| Header | Variable | JSON string of the header dict, encoded in UTF-8 and prefixed with its length in bytes. |
| | | Example JSON: `{"embedding_model": "default", "embedding_size": 64, "embedding_dtype": "float32", "context_length": 512}` |
| Embeddings | Variable | Each embedding is 64 dimensions of `float32`. 64 floats * 4 bytes each = 256 bytes. |
| | | An embedding example might look like a sequence of 256 bytes after packing. |
| Contexts      | Variable    | Each context string is prefixed with its length in bytes, followed by the string encoded in UTF-8. |
| | | Example: A context string with its length prefix. |
| Metadata | Variable | JSON string of the metadata dict, encoded in UTF-8 and prefixed with its length in bytes. |
| | | Example JSON: `{"created_at": "2024-04-18", ...}` |


### Creating a CTX File
To create a new CTX file, use the `create` method of the `Ctx` class:
```python
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,4 @@ Requests
beautifulsoup4
huggingface_hub
tiktoken
onnxruntime==1.17.1
tokenizers==0.15.2
tokenizers==0.15.2
Binary file added tests/attention.pdf
Binary file not shown.
Binary file added tests/chromadb/chroma.sqlite3
Binary file not shown.
468 changes: 468 additions & 0 deletions tests/notebook3.ipynb

Large diffs are not rendered by default.

56 changes: 55 additions & 1 deletion tests/notebook4.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,61 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"!pip install langchain-chroma\n",
"!pip install pypdf\n",
"!pip install sentence-transformers\n",
"import requests\n",
"\n",
"from vlite.utils import process_pdf\n",
"\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_community.document_loaders import PyPDFLoader\n",
"\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"\n",
"from langchain_community.embeddings.sentence_transformer import (\n",
" SentenceTransformerEmbeddings,\n",
")\n",
"\n",
"from langchain_chroma import Chroma\n",
"\n",
"import chromadb\n",
"\n",
"# Download the PDF\n",
"open('attention.pdf', 'wb').write(requests.get('https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf').content)\n",
"\n",
"# Load the PDF document\n",
"loader = PyPDFLoader(\"attention.pdf\")\n",
"pages = loader.load_and_split()\n",
"\n",
"# Split the documents into chunks\n",
"text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)\n",
"texts = text_splitter.split_documents(pages)\n",
"\n",
"# Initialize embeddings\n",
"embeddings = SentenceTransformerEmbeddings(model_name=\"mixedbread-ai/mxbai-embed-large-v1\")\n",
"\n",
"# Initialize Chroma\n",
"chroma_client = chromadb.PersistentClient(path=\"chromadb\")\n",
"collection = chroma_client.create_collection(name=\"attention\")\n",
"\n",
"# Add texts to the Chroma vector database\n",
"chroma = Chroma.from_documents(texts, embeddings, collection_name=\"attention\")\n",
"\n",
"# Perform a similarity search\n",
"query = \"What is attention?\"\n",
"docs = chroma.similarity_search(query, k=3)\n",
"\n",
"# Print the most relevant chunks\n",
"for doc in docs:\n",
" print(doc.page_content)\n",
" print('---')\n",
"\n",
"# Get collection information\n",
"collection = chroma_client.get_collection(name=\"attention\")\n",
"print(f\"Number of elements in the collection: {collection.count()}\")"
]
}
],
"metadata": {
Expand Down
217 changes: 217 additions & 0 deletions tests/querytest.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/sdan/miniforge3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Not using OCR for data/attention2.pdf\n",
"Initial length of the texts: 4\n",
"Initial length of the embeddings: 4\n",
"Extended length of the texts: 500000\n",
"Extended length of the embeddings: 500000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of items in the VLite instance: 500000\n",
"[VLite.info] Collection Information:\n",
"[VLite.info] Items: 500000\n",
"[VLite.info] Collection file: vlite_20240417_211628\n",
"[VLite.info] Embedding model: <vlite.model.EmbeddingModel object at 0x104301480>\n",
"Execution time: 1.197732925415039 seconds\n"
]
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"\n",
"from vlite import VLite\n",
"from vlite import EmbeddingModel\n",
"from vlite.utils import process_file, process_pdf, process_webpage\n",
"import time\n",
"\n",
"# Load and process data\n",
"corpus = process_file(\"data/attention2.pdf\")\n",
"emd = EmbeddingModel()\n",
"embeddings = emd.embed(corpus)\n",
"\n",
"# Verify the initial lengths\n",
"print(f\"Initial length of the texts: {len(corpus)}\")\n",
"print(f\"Initial length of the embeddings: {len(embeddings)}\")\n",
"\n",
"# Multiply texts and embeddings\n",
"multiplier = 125000\n",
"extended_corpus = corpus * multiplier\n",
"# Correctly extend the list of embeddings\n",
"extended_embeddings = [emb for emb in embeddings for _ in range(multiplier)]\n",
"\n",
"# Verify the extended lengths\n",
"print(f\"Extended length of the texts: {len(extended_corpus)}\")\n",
"print(f\"Extended length of the embeddings: {len(extended_embeddings)}\")\n",
"\n",
"# Create VLite instance and use set_batch\n",
"vdb = VLite()\n",
"vdb.set_batch(texts=extended_corpus, embeddings=extended_embeddings)\n",
"\n",
"# Check and display results\n",
"print(f\"Number of items in the VLite instance: {vdb.count()}\")\n",
"vdb.info()\n",
"\n",
"start_time = time.time()\n",
"\n",
"vdb.retrieve(\"attention\", top_k=1)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"\n",
"print(f\"Execution time: {execution_time} seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Not using OCR for data/attention2.pdf\n",
"Initial length of the texts: 4\n",
"Initial length of the embeddings: 4\n",
"Extended length of the texts: 1000000\n",
"Extended length of the embeddings: 1000000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
"To disable this warning, you can either:\n",
"\t- Avoid using `tokenizers` before the fork if possible\n",
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of items in the VLite instance: 1000000\n",
"[VLite.info] Collection Information:\n",
"[VLite.info] Items: 1000000\n",
"[VLite.info] Collection file: vlite_20240417_211918\n",
"[VLite.info] Embedding model: <vlite.model.EmbeddingModel object at 0x1043021a0>\n",
"Execution time: 2.6589159965515137 seconds\n"
]
}
],
"source": [
"import sys\n",
"sys.path.insert(0, '..')\n",
"\n",
"from vlite import VLite\n",
"from vlite import EmbeddingModel\n",
"from vlite.utils import process_file, process_pdf, process_webpage\n",
"import time\n",
"\n",
"# Load and process data\n",
"corpus = process_file(\"data/attention2.pdf\")\n",
"emd = EmbeddingModel()\n",
"embeddings = emd.embed(corpus)\n",
"\n",
"# Verify the initial lengths\n",
"print(f\"Initial length of the texts: {len(corpus)}\")\n",
"print(f\"Initial length of the embeddings: {len(embeddings)}\")\n",
"\n",
"# Multiply texts and embeddings\n",
"multiplier = 125000*2\n",
"extended_corpus = corpus * multiplier\n",
"# Correctly extend the list of embeddings\n",
"extended_embeddings = [emb for emb in embeddings for _ in range(multiplier)]\n",
"\n",
"# Verify the extended lengths\n",
"print(f\"Extended length of the texts: {len(extended_corpus)}\")\n",
"print(f\"Extended length of the embeddings: {len(extended_embeddings)}\")\n",
"\n",
"# Create VLite instance and use set_batch\n",
"vdb = VLite()\n",
"vdb.set_batch(texts=extended_corpus, embeddings=extended_embeddings)\n",
"\n",
"# Check and display results\n",
"print(f\"Number of items in the VLite instance: {vdb.count()}\")\n",
"vdb.info()\n",
"\n",
"start_time = time.time()\n",
"\n",
"vdb.retrieve(\"attention\", top_k=1)\n",
"\n",
"end_time = time.time()\n",
"execution_time = end_time - start_time\n",
"\n",
"print(f\"Execution time: {execution_time} seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 9ce4907

Please sign in to comment.