add hybrid search with splade and bge-m3;

Tiledesk · Sep 29, 2024 · be08fb7 · be08fb7
1 parent 1954fc8
commit be08fb7
Show file tree

Hide file tree

Showing 18 changed files with 1,916 additions and 1,497 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,8 +5,21 @@
     *Andrea Sponziello* 
 ### **Copyright**: *Tiledesk SRL*
 
+## [2024-09-21]
+### 0.3.1
+- add: sentence embedding with bge-m3
+- add: hybrid search with bge-m3
+- modify: deleted env variable for vector store 
+
+## [2024-09-23]
+### 0.3.0
+- add: hybrid search
+- add: indexing based on splade
+- minor fix
+
+
 ## [2024-09-17]
-### 0.2.19
+### 0.2.20
 - upgrade: worker to 0.0.27
 
 ## [2024-09-14]

diff --git a/Dockerfile b/Dockerfile
@@ -16,8 +16,11 @@ RUN python -m nltk.downloader punkt
 RUN python -m nltk.downloader punkt_tab
 RUN python -m nltk.downloader averaged_perceptron_tagger
 RUN python -m nltk.downloader averaged_perceptron_tagger_eng
+RUN python -m nltk.downloader stopwords
 RUN playwright install chromium
 RUN playwright install-deps chromium
+RUN python -c "from transformers import AutoModelForSequenceClassification; model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-m3');"
+RUN python -c "from transformers import AutoModelForSequenceClassification; model = AutoModelForSequenceClassification.from_pretrained('naver/splade-cocondenser-ensembledistil');"
 # Adjust Redis settings
 ENV REDIS_HOST=redis
 ENV REDIS_URL=redis://redis:6379/0

diff --git a/README.md b/README.md
@@ -94,6 +94,7 @@ Models for /api/ask
 - gpt-4
 - gpt-4-turbo
 - gpt-4o
+- gpt-4o-mini
 
 ### Cohere - engine: cohere
 - command-r
@@ -135,4 +136,58 @@ In this method, any difference greater than X standard deviations is split.
 In this method, the interquartile distance is used to split chunks.
 
 ### gradient
-In this method, the gradient of distance is used to split chunks along with the percentile method. This method is useful when chunks are highly correlated with each other or specific to a domain e.g. legal or medical. The idea is to apply anomaly detection on gradient array so that the distribution become wider and easy to identify boundaries in highly semantic data.
+In this method, the gradient of distance is used to split chunks along with the percentile method. This method is useful when chunks are highly correlated with each other or specific to a domain, e.g. legal or medical. The idea is to apply anomaly detection to the gradient array so that the distribution becomes wider and boundaries are easier to identify in highly semantic data.
+
+
+## Hybrid Search
+
+### /api/scrape/single
+
+```json
+{
+ ...
+ "embedding":"huggingface",
+  "hybrid":true,
+  "sparse_encoder":"splade|bge-m3",
+  ... 
+  "engine":
+   {
+    "name": "",
+    "type": "",
+    "apikey" : "",
+    "vector_size": 1024,
+    "index_name": "" 
+   }  
+}
+```
+
+### /api/qa
+
+```json
+{
+  "question": "question",
+  "namespace": "",
+  "debug":true,
+  "citations":true,
+  "llm": "anthropic|groq",
+  "gptkey": "api-key of llm",
+  "model": "e.g. claude-3-5-sonnet-20240620 | llama-3.1-70b-versatile",
+  "temperature": 0.9,
+  "max_tokens":2048,
+  "embedding":"huggingface",
+  "sparse_encoder":"splade|bge-m3",
+  "search_type":"hybrid",
+  "alpha": 0.2,
+  "similarity_threshold":0.95,
+  "system_context":"",
+  "top_k": 6,
+  "engine":
+  {
+    "name": "",
+    "type": "",
+    "apikey" : "",
+    "vector_size": 1024,
+    "index_name": "" 
+  }
+}
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tilellm"
-version = "0.2.20"
+version = "0.3.0"
 description = "tiledesk for RAG"
 authors = ["Gianluca Lorenzo <[email protected]>"]
 repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -18,20 +18,23 @@ jsonschema= "4.23.0"
 redis= "^5.0.7"
 aioredis= "2.0.1"
 #redismutex = "^1.0.0"
-langchain = "0.3.0" #"0.2.16"
+langchain = "0.3.1"#"0.3.0" #"0.2.16"
 jq = "1.8.0" #"1.7.0"
-openai = "1.45.1" #"1.37.1"
-langchain-openai = "0.2.0" #"0.1.19"
+openai = "1.48.0" #"1.45.1" #"1.37.1"
+langchain-openai ="0.2.1" #"0.2.0" #"0.1.19"
 langchain-voyageai = "0.1.2" #"0.1.1"
-langchain-anthropic = "0.2.0" #"0.1.21"
+langchain-anthropic = "0.2.1" #"0.2.0" #"0.1.21"
 langchain-cohere= "0.3.0" #"0.1.9"
 langchain-google-genai= "2.0.0" #"1.0.8"
 langchain-groq = "0.2.0" #"0.1.8"
-langchain-aws= "0.2.0" #"0.1.12"
+langchain-aws= "0.2.1" #"0.1.12"
 pinecone-client = "5.0.1" #"5.0.0"
 python-dotenv = "1.0.1"
-langchain-community = "0.3.0" #"0.2.10"
-langchain-experimental = "0.3.0" #no previous
+langchain-community = "0.3.1" #"0.2.10"
+langchain-experimental = "0.3.1" #no previous
+langchain-pinecone = "0.2.0"
+langchain-huggingface="0.1.0"
+peft = "0.13.0"
 
 tiktoken = "0.7.0"
 beautifulsoup4 = "4.12.3"
@@ -45,8 +48,18 @@ html2text= "2024.2.26"
 psutil= "6.0.0"
 httpx= "0.27.2" #"0.27.0"
 gql= "3.5.0"
+PyJWT= "2.9.0"
+#pinecone-text= "0.9.0"
+torch="2.4.1"
+FlagEmbedding="1.2.11"
 
 
+
+
+[tool.poetry.dependencies.pinecone-text]
+version = "0.9.0"
+extras = ["splade"]
+
 [tool.poetry.dependencies.uvicorn]
 version = "0.30.6" #"0.30.3"
 extras = ["standard"]