fix: user-agent for scrape
glorenzo972 committed Jul 26, 2024
1 parent b2b497e commit 3803d70
Showing 8 changed files with 114 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@
*Andrea Sponziello*
### **Copyright**: *Tiledesk SRL*

## [2024-07-26]
### 0.2.7
- add: scrape_type=3|4
- add: to /api/qa "similarity_threshold"

## [2024-07-09]
### 0.2.6
- add: DELETE /api/chunk/<chunk_id>/namespace/<namespace>
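A rough sketch of how the two options added in 0.2.7 might be exercised from a client. The /api/qa path comes from the changelog; the host, port and the payload fields other than similarity_threshold, top_k, namespace and search_type (all visible in QuestionAnswer below) are assumptions, not confirmed by this commit:

import requests

# Hypothetical request against a local tilellm instance (host/port assumed).
payload = {
    "question": "What does the PNRR fund?",   # field name assumed, not shown in this diff
    "namespace": "my-namespace",              # illustrative value
    "top_k": 5,
    "search_type": "similarity",
    "similarity_threshold": 0.95,             # new in 0.2.7: drops near-duplicate chunks
}
response = requests.post("http://localhost:8000/api/qa", json=payload)
print(response.json())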
2 changes: 2 additions & 0 deletions Dockerfile
@@ -14,6 +14,8 @@ RUN pip install .
RUN pip install "uvicorn[standard]" gunicorn
RUN python -m nltk.downloader punkt
RUN python -m nltk.downloader averaged_perceptron_tagger
RUN playwright install chromium
RUN playwright install-deps chromium
# TODO: adjust Redis configuration
ENV REDIS_HOST=redis
ENV REDIS_URL=redis://redis:6379/0
7 changes: 4 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.2.6"
version = "0.2.7"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -34,11 +34,12 @@ langchain_community = "0.2.x"
tiktoken = "0.7.x"
beautifulsoup4 = "^4.12.3"
#uvicorn = "^0.28"
unstructured= "0.14.x"
#playwright = "^1.43.0"
unstructured= "0.15.0"
playwright = "1.45.1"
pypdf="^4.2.0"
docx2txt="^0.8"
wikipedia="^1.4.0"
html2text="2024.2.26"
psutil="^6.0.0"

[tool.poetry.dependencies.uvicorn]
20 changes: 16 additions & 4 deletions tilellm/controller/controller.py
@@ -2,6 +2,9 @@

import fastapi
from langchain.chains import ConversationalRetrievalChain, LLMChain # Deprecata
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_core.prompts import PromptTemplate, SystemMessagePromptTemplate
from langchain_openai import ChatOpenAI
# from tilellm.store.pinecone_repository import add_pc_item as pinecone_add_item
@@ -240,10 +243,19 @@ async def ask_with_memory(question_answer, repo=None) -> RetrievalResult:

vector_store = await repo.create_pc_index(oai_embeddings, emb_dimension)

vs_retriever = vector_store.as_retriever(search_type=question_answer.search_type,
                                         search_kwargs={'k': question_answer.top_k,
                                                        'namespace': question_answer.namespace}
                                         )

redundant_filter = EmbeddingsRedundantFilter(embeddings=oai_embeddings,
similarity_threshold=question_answer.similarity_threshold)
pipeline_compressor = DocumentCompressorPipeline(
transformers=[redundant_filter]
)
retriever = ContextualCompressionRetriever(
base_compressor=pipeline_compressor, base_retriever=vs_retriever
)

if question_answer.system_context is not None and question_answer.system_context:

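The block above threads the new similarity_threshold through an EmbeddingsRedundantFilter wrapped in a ContextualCompressionRetriever, so retrieved chunks that are nearly identical to one another are dropped before they reach the prompt. A minimal, self-contained sketch of that filtering step follows; the sample documents, the 0.95 threshold and the use of OpenAIEmbeddings are illustrative, not taken from the repository:

from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()  # assumes OPENAI_API_KEY is set in the environment
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings,
                                             similarity_threshold=0.95)

docs = [
    Document(page_content="Tiledesk is an open-source live chat platform."),
    Document(page_content="Tiledesk is an open source live-chat platform."),  # near-duplicate
    Document(page_content="The PNRR funds research and innovation projects."),
]

filtered = redundant_filter.transform_documents(docs)
print(len(filtered))  # expected: 2, the near-duplicate is dropped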
33 changes: 32 additions & 1 deletion tilellm/models/item_model.py
@@ -1,8 +1,25 @@
from pydantic import BaseModel, Field, field_validator, ValidationError, model_validator
from typing import Dict, Optional, List, Union
import datetime


class ParametersScrapeType4(BaseModel):
unwanted_tags: Optional[List[str]] = Field(default_factory=list)
tags_to_extract: Optional[List[str]] = Field(default_factory=list)
unwanted_classnames: Optional[List[str]] = Field(default_factory=list)
desired_classnames: Optional[List[str]] = Field(default_factory=list)
remove_lines: Optional[bool] = Field(default=False)
remove_comments: Optional[bool] = Field(default=False)

@model_validator(mode='after')
def check_booleans(cls, values):
remove_lines = values.remove_lines
remove_comments = values.remove_comments
if remove_lines is None or remove_comments is None:
raise ValueError('remove_lines and remove_comments must be provided in ParametersScrapeType4')
return values


class ItemSingle(BaseModel):
id: str
source: str | None = None
@@ -15,6 +32,19 @@ class ItemSingle(BaseModel):
webhook: str = Field(default_factory=lambda: "")
chunk_size: int = Field(default_factory=lambda: 1000)
chunk_overlap: int = Field(default_factory=lambda: 400)
parameters_scrape_type_4: Optional[ParametersScrapeType4] = None

@model_validator(mode='after')
def check_scrape_type(cls, values):
scrape_type = values.scrape_type
parameters_scrape_type_4 = values.parameters_scrape_type_4

if scrape_type == 4:
if parameters_scrape_type_4 is None:
raise ValueError('parameters_scrape_type_4 must be provided when scrape_type is 4')
else:
values.parameters_scrape_type_4 = None
return values


class MetadataItem(BaseModel):
@@ -57,6 +87,7 @@ class QuestionAnswer(BaseModel):
top_k: int = Field(default=5)
max_tokens: int = Field(default=128)
embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
similarity_threshold: float = Field(default_factory=lambda: 1.0)
debug: bool = Field(default_factory=lambda: False)
system_context: Optional[str] = None
search_type: str = Field(default_factory=lambda: "similarity")
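A small usage sketch for the new scraping parameters; every value below is invented for illustration:

# Illustrative construction of the new model; all values are made up.
params = ParametersScrapeType4(
    tags_to_extract=["article", "p"],
    unwanted_tags=["script", "style"],
    unwanted_classnames=["cookie-banner", "nav"],
    remove_lines=True,
    remove_comments=True,
)
print(params.model_dump())

# ItemSingle.check_scrape_type then requires such an object whenever
# scrape_type == 4 and silently resets it to None for any other scrape_type.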
5 changes: 4 additions & 1 deletion tilellm/store/pinecone/pinecone_repository_pod.py
@@ -43,6 +43,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
parameters_scrape_type_4 = item.parameters_scrape_type_4
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -67,7 +68,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)

documents = []
if type_source == 'url' or type_source == 'txt':
documents = await get_content_by_url(source,
                                     scrape_type,
                                     parameters_scrape_type_4=parameters_scrape_type_4)
else: # type_source == 'pdf' or 'docx' or 'txt':
documents = load_document(source, type_source)

5 changes: 4 additions & 1 deletion tilellm/store/pinecone/pinecone_repository_serverless.py
@@ -45,6 +45,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
parameters_scrape_type_4 = item.parameters_scrape_type_4
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -73,7 +74,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):

documents = []
if type_source == 'url' or type_source == 'txt':
documents = await get_content_by_url(source,
                                     scrape_type,
                                     parameters_scrape_type_4=parameters_scrape_type_4)
else: # elif type_source == 'pdf' or 'docx' or 'txt':
documents = load_document(source, type_source)

51 changes: 47 additions & 4 deletions tilellm/tools/document_tool_simple.py
@@ -3,27 +3,68 @@

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_loaders import PlaywrightURLLoader

import requests
import logging

from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.documents import Document

logger = logging.getLogger(__name__)


# "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
async def get_content_by_url(url: str, scrape_type: int, **kwargs) -> list[Document]:
"""
Get content by url! parse html page and extract content.
If scrape_type=0 Unstructured analyze the page and extract some useful information about page, like UL, Title etc.
If scrape_type=1, extract all the content.
If scape_type=2 is used playwright.
If scape_type=3 is used AsyncChromiumLoader and the html is transformed in text
If scape_type=4 is used AsyncChromiumLoader and BS4 in order to select the html element to extract
:param url: str representing url
:param scrape_type: 0|1|2!3!4
:return: list[Document]
"""
try:
urls = [url]
if scrape_type == 0:
loader = UnstructuredURLLoader(
urls=urls, mode="elements", strategy="fast", continue_on_failure=False,
headers={'user-agent': 'Mozilla/5.0'}
)
docs = await loader.aload()

elif scrape_type == 1:
loader = UnstructuredURLLoader(
urls=urls, mode="single", continue_on_failure=False,
headers={'user-agent': 'Mozilla/5.0'}
)
docs = await loader.aload()
elif scrape_type == 2:
loader = PlaywrightURLLoader(urls=urls)
docs = await loader.aload()
elif scrape_type == 3:
loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
docs = await loader.aload()
from langchain_community.document_transformers import Html2TextTransformer
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docs = docs_transformed
else:
params_type_4 = kwargs.get("parameters_scrape_type_4")
loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
docs = await loader.aload()
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs,
tags_to_extract=params_type_4.tags_to_extract,
unwanted_tags=params_type_4.unwanted_tags,
unwanted_classnames=params_type_4.unwanted_classnames,
remove_lines=params_type_4.remove_lines,
remove_comments=params_type_4.remove_comments
)
docs = docs_transformed

for doc in docs:
doc.metadata = clean_metadata(doc.metadata)
@@ -57,6 +98,8 @@ def load_document(url: str, type_source: str):
return None

data = loader.load()
# from pprint import pprint
# pprint(data)
return data


@@ -67,7 +110,7 @@ def load_from_wikipedia(query, lang='en', load_max_docs=2):
return data


def get_content_by_url_with_bs(url: str):
html = requests.get(url)
# urls = [url]
# Load HTML
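A hedged sketch of calling the reworked loader with scrape_type=4. The URL reuses the example from the comment above get_content_by_url, ParametersScrapeType4 comes from tilellm.models.item_model, the parameter values are illustrative, and a working Playwright/Chromium installation (as set up in the Dockerfile) is assumed:

import asyncio

from tilellm.models.item_model import ParametersScrapeType4
from tilellm.tools.document_tool_simple import get_content_by_url

async def main():
    params = ParametersScrapeType4(
        tags_to_extract=["article"],
        unwanted_tags=["script", "style"],
        remove_lines=True,
        remove_comments=True,
    )
    docs = await get_content_by_url(
        "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/",
        4,  # scrape_type=4: AsyncChromiumLoader + BeautifulSoup element selection
        parameters_scrape_type_4=params,
    )
    print(len(docs), docs[0].metadata if docs else None)

asyncio.run(main())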
