chore: Fixes, better param handling (#77)
* fix: miner fixes, encoding length

* chore: pre-release
bclavie authored Jan 27, 2024
1 parent c94210c commit 2ac1b1d
Showing 6 changed files with 71 additions and 54 deletions.
34 changes: 25 additions & 9 deletions examples/06-index_free_use.ipynb
@@ -22,22 +22,22 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bclavie/miniforge3/envs/ragatouille/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
"/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Jan 25, 18:45:56] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...\n"
"[Jan 27, 16:56:39] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/bclavie/miniforge3/envs/ragatouille/lib/python3.11/site-packages/torch/cuda/amp/grad_scaler.py:125: UserWarning: torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.\n",
"/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py:125: UserWarning: torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.\n",
" warnings.warn(\n"
]
}
@@ -97,17 +97,20 @@
"name": "stderr",
"output_type": "stream",
"text": [
" 0%| | 0/7 [00:00<?, ?it/s]/Users/bclavie/miniforge3/envs/ragatouille/lib/python3.11/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
" 0%| | 0/7 [00:00<?, ?it/s]/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
" warnings.warn(\n",
" 14%|█▍ | 1/7 [00:03<00:23, 3.97s/it]/Users/bclavie/miniforge3/envs/ragatouille/lib/python3.11/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
" 14%|█▍ | 1/7 [00:03<00:21, 3.62s/it]/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n",
" warnings.warn(\n",
"100%|██████████| 7/7 [00:22<00:00, 3.21s/it]"
"100%|██████████| 7/7 [00:20<00:00, 2.90s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"encodings: torch.Size([212, 256, 128])\n",
"doc_masks: torch.Size([212, 256])\n",
"Documents encoded!\n"
]
},
@@ -182,13 +185,16 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 6.30it/s]"
"100%|██████████| 1/1 [00:00<00:00, 10.43it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"encodings: torch.Size([2, 256, 128])\n",
"doc_masks: torch.Size([2, 256])\n",
"Documents encoded!\n"
]
},
@@ -271,13 +277,23 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 1/1 [00:00<00:00, 4.91it/s]"
" 0%| | 0/1 [00:00<?, ?it/s]"
]
},
+{
+"name": "stderr",
+"output_type": "stream",
+"text": [
+"100%|██████████| 1/1 [00:00<00:00, 4.49it/s]"
+]
+},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shapes:\n",
"encodings: torch.Size([2, 256, 128])\n",
"doc_masks: torch.Size([2, 256])\n",
"Documents encoded!\n"
]
},
@@ -341,7 +357,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
"version": "3.9.18"
}
},
"nbformat": 4,
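Note on the refreshed notebook output: the new "Shapes:" lines come from the debug prints added in ragatouille/models/colbert.py below, and follow [num_chunks, doc_maxlen, embedding_dim]: 212 chunks, each padded to 256 tokens, with 128-dimensional ColBERT vectors. A minimal sketch of the index-free flow this notebook exercises (the model name is the standard ColBERTv2 checkpoint; the documents and query are illustrative):

from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
# encode() embeds documents in memory with no index build; with this release
# it also reports the padded shapes, e.g. encodings [212, 256, 128]
RAG.encode(["ColBERT is a late-interaction retrieval model.", "RAGatouille wraps it for RAG pipelines."])
results = RAG.search_encoded_docs(query="What is ColBERT?", k=2)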
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "RAGatouille"
version = "0.0.5b3"
version = "0.0.6a1"
description = "Library to facilitate the use of state-of-the-art retrieval models in common RAG contexts."
authors = ["Benjamin Clavie <[email protected]>"]
readme = "README.md"
14 changes: 6 additions & 8 deletions ragatouille/RAGPretrainedModel.py
@@ -1,5 +1,5 @@
from pathlib import Path
-from typing import Any, Callable, List, Optional, TypeVar, Union
+from typing import Any, Callable, List, Literal, Optional, TypeVar, Union
from uuid import uuid4

from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
@@ -132,7 +132,7 @@ def index(
document_ids: Union[TypeVar("T"), List[TypeVar("T")]] = None,
document_metadatas: Optional[list[dict]] = None,
index_name: str = None,
-overwrite_index: bool = True,
+overwrite_index: Union[bool, str] = True,
max_document_length: int = 256,
split_documents: bool = True,
document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter,
@@ -144,7 +144,7 @@ def index(
collection (list[str]): The collection of documents to index.
document_ids (Optional[list[str]]): An optional list of document ids. Ids will be generated at index time if not supplied.
index_name (str): The name of the index that will be built.
-overwrite_index (bool): Whether to overwrite an existing index with the same name.
+overwrite_index (Union[bool, str]): Whether to overwrite an existing index with the same name.
max_document_length (int): The maximum length of a document. Documents longer than this will be split into chunks.
split_documents (bool): Whether to split documents into chunks.
document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the llama_index_sentence_splitter.
@@ -180,17 +180,13 @@ def index(
index: item["document_id"] for index, item in enumerate(collection_with_ids)
}
collection = [x["content"] for x in collection_with_ids]
-
-overwrite = "reuse"
-if overwrite_index:
-overwrite = True
return self.model.index(
collection,
pid_docid_map=pid_docid_map,
docid_metadata_map=docid_metadata_map,
index_name=index_name,
max_document_length=max_document_length,
-overwrite=overwrite,
+overwrite=overwrite_index,
)

def add_to_index(
@@ -342,6 +338,7 @@ def encode(
bsize: int = 32,
document_metadatas: Optional[list[dict]] = None,
verbose: bool = True,
+max_document_length: Union[Literal["auto"], int] = "auto",
):
"""Encode documents in memory to be searched through with no Index. Performance degrades rapidly with more documents.
@@ -357,6 +354,7 @@
bsize=bsize,
document_metadatas=document_metadatas,
verbose=verbose,
+max_tokens=max_document_length,
)
if verbose:
print("Documents encoded!")
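With this change, index() forwards overwrite_index to the underlying model untouched, so callers can pass either a bool or one of the string modes the removed shim collapsed everything into, and encode() exposes a new max_document_length defaulting to "auto". A hedged usage sketch (the index name and documents are illustrative; "reuse" mirrors the string the removed shim injected for the False case):

RAG.index(
    collection=["Document one...", "Document two..."],
    index_name="my_index",
    overwrite_index="reuse",  # strings now pass straight through; True/False still work
)
RAG.encode(["A short document."], max_document_length="auto")  # new parameter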
2 changes: 1 addition & 1 deletion ragatouille/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.0.5b3"
__version__ = "0.0.6a1"
from .RAGPretrainedModel import RAGPretrainedModel
from .RAGTrainer import RAGTrainer

4 changes: 1 addition & 3 deletions ragatouille/data/training_data_processor.py
@@ -100,9 +100,7 @@ def _get_new_negatives(self, query, passages, mine_hard_negatives, n_new_negativ
- Randomly sampling from the full collection otherwise
"""
if mine_hard_negatives:
-hard_negatives = self.negative_miner.mine_hard_negatives(
-query, n_new_negatives
-)
+hard_negatives = self.negative_miner.mine_hard_negatives(query)
candidates = [
x
for x in hard_negatives
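The miner fix narrows the call: mine_hard_negatives now receives only the query, with the number of negatives governed by the miner's own configuration rather than a per-call n_new_negatives, which is still honored downstream when candidates are sampled. A rough sketch of the new call shape (the miner instance and the positives list here are hypothetical):

# hypothetical SimpleMiner-style instance, configured elsewhere with its own count
hard_negatives = negative_miner.mine_hard_negatives("what is late interaction?")
# downstream, candidates are filtered against known positives before sampling
candidates = [x for x in hard_negatives if x not in known_positives]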
69 changes: 37 additions & 32 deletions ragatouille/models/colbert.py
@@ -553,17 +553,17 @@ def _set_inference_max_tokens(
):
if max_tokens == "auto" or max_tokens > 512:
max_tokens = 512
-percentile_80 = np.percentile(
-[len(x.split(" ")) for x in documents], 80
+percentile_90 = np.percentile(
+[len(x.split(" ")) for x in documents], 90
)
max_tokens = min(
-math.ceil((percentile_80 * 1.35) / 32) * 32,
+math.ceil((percentile_90 * 1.35) / 32) * 32,
512,
)
max_tokens = max(256, max_tokens)
if max_tokens > 288:
print(
f"Your documents are roughly {percentile_80} tokens long at the 80th percentile!",
f"Your documents are roughly {percentile_90} tokens long at the 90th percentile!",
"This is quite long and might slow down reranking!\n",
"Provide fewer documents, build smaller chunks or run on GPU",
"if it takes too long for your needs!",
@@ -655,6 +655,38 @@ def encode(
encodings, doc_masks = self._encode_index_free_documents(
documents, bsize=bsize, verbose=verbose
)
+encodings = torch.cat(
+[
+encodings,
+torch.zeros(
+(
+encodings.shape[0],
+self.inference_ckpt.doc_tokenizer.doc_maxlen
+- encodings.shape[1],
+encodings.shape[2],
+)
+),
+],
+dim=1,
+)
+doc_masks = torch.cat(
+[
+doc_masks,
+torch.full(
+(
+doc_masks.shape[0],
+self.inference_ckpt.doc_tokenizer.doc_maxlen
+- doc_masks.shape[1],
+),
+-float("inf"),
+),
+],
+dim=1,
+)
+
+print("Shapes:")
+print(f"encodings: {encodings.shape}")
+print(f"doc_masks: {doc_masks.shape}")

if hasattr(self, "in_memory_collection"):
if self.in_memory_metadata is not None:
@@ -669,34 +701,7 @@
self.in_memory_collection.extend(documents)

-# add 0 padding to encodings so they're self.inference_ckpt.doc_tokenizer.doc_maxlen length
-encodings = torch.cat(
-[
-encodings,
-torch.zeros(
-(
-encodings.shape[0],
-self.inference_ckpt.doc_tokenizer.doc_maxlen
-- encodings.shape[1],
-encodings.shape[2],
-)
-),
-],
-dim=1,
-)
-doc_masks = torch.cat(
-[
-doc_masks,
-torch.full(
-(
-doc_masks.shape[0],
-self.inference_ckpt.doc_tokenizer.doc_maxlen
-- doc_masks.shape[1],
-),
--float("inf"),
-),
-],
-dim=1,
-)

self.in_memory_embed_docs = torch.cat(
[self.in_memory_embed_docs, encodings], dim=0
)
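Two things change in this file: _set_inference_max_tokens now sizes max_tokens from the 90th percentile of whitespace-token counts (up from the 80th) before rounding up to a multiple of 32 and clamping to [256, 512], and the padding block moves up so each batch is padded to the tokenizer's doc_maxlen right after encoding, before the in-memory store is extended. Padded embedding slots get zeros and padded mask slots get -inf, so they can never win a maxsim comparison. A standalone sketch of the same padding arithmetic, with illustrative shapes rather than values from this diff:

import torch

doc_maxlen = 256                      # tokenizer's document length cap
encodings = torch.randn(2, 180, 128)  # [num_docs, seq_len, dim] from the encoder
doc_masks = torch.zeros(2, 180)       # one mask slot per token position

pad = doc_maxlen - encodings.shape[1]
# zero-pad embeddings out to doc_maxlen along the sequence dimension
encodings = torch.cat(
    [encodings, torch.zeros(encodings.shape[0], pad, encodings.shape[2])], dim=1
)
# fill padded mask slots with -inf so maxsim never selects them
doc_masks = torch.cat(
    [doc_masks, torch.full((doc_masks.shape[0], pad), -float("inf"))], dim=1
)
assert encodings.shape == (2, 256, 128) and doc_masks.shape == (2, 256)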
