From 3803d7000554466fa5f74541930a88d13a889ab0 Mon Sep 17 00:00:00 2001
From: glorenzo972
Date: Fri, 26 Jul 2024 14:53:22 +0200
Subject: [PATCH] fix: user-agent for scrape

---
 CHANGELOG.md                                     |  5 ++
 Dockerfile                                       |  2 +
 pyproject.toml                                   |  7 +--
 tilellm/controller/controller.py                 | 20 ++++++--
 tilellm/models/item_model.py                     | 33 +++++++++++-
 .../store/pinecone/pinecone_repository_pod.py    |  5 +-
 .../pinecone_repository_serverless.py            |  5 +-
 tilellm/tools/document_tool_simple.py            | 51 +++++++++++++++++--
 8 files changed, 114 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5012133..440a385 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@
 *Andrea Sponziello*
 ### **Copyrigth**: *Tiledesk SRL*
 
+## [2024-07-26]
+### 0.2.7
+- add: scrape_type=3|4
+- add: "similarity_threshold" to /api/qa
+
 ## [2024-07-09]
 ### 0.2.6
 - add: DELETE /api/chunk//namespace/
diff --git a/Dockerfile b/Dockerfile
index 7df2d1e..b02c3b4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,8 @@ RUN pip install .
 RUN pip install "uvicorn[standard]" gunicorn
 RUN python -m nltk.downloader punkt
 RUN python -m nltk.downloader averaged_perceptron_tagger
+RUN playwright install chromium
+RUN playwright install-deps chromium
 # Aggiustare redis
 ENV REDIS_HOST=redis
 ENV REDIS_URL=redis://redis:6379/0
diff --git a/pyproject.toml b/pyproject.toml
index ef3cde6..81dbaaf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tilellm"
-version = "0.2.6"
+version = "0.2.7"
 description = "tiledesk for RAG"
 authors = ["Gianluca Lorenzo "]
 repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -34,11 +34,12 @@ langchain_community = "0.2.x"
 tiktoken = "0.7.x"
 beautifulsoup4 = "^4.12.3"
 #uvicorn = "^0.28"
-unstructured= "0.14.x"
-#playwright = "^1.43.0"
+unstructured= "0.15.0"
+playwright = "1.45.1"
 pypdf="^4.2.0"
 docx2txt="^0.8"
 wikipedia="^1.4.0"
+html2text="2024.2.26"
 psutil="^6.0.0"
 
 [tool.poetry.dependencies.uvicorn]
diff --git a/tilellm/controller/controller.py b/tilellm/controller/controller.py
index bee90b5..9b0bcff 100644
--- a/tilellm/controller/controller.py
+++ b/tilellm/controller/controller.py
@@ -2,6 +2,9 @@
 import fastapi
 
 from langchain.chains import ConversationalRetrievalChain, LLMChain  # Deprecata
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import DocumentCompressorPipeline
+from langchain_community.document_transformers import EmbeddingsRedundantFilter
 from langchain_core.prompts import PromptTemplate, SystemMessagePromptTemplate
 from langchain_openai import ChatOpenAI
 # from tilellm.store.pinecone_repository import add_pc_item as pinecone_add_item
@@ -240,10 +243,19 @@ async def ask_with_memory(question_answer, repo=None) -> RetrievalResult:
 
         vector_store = await repo.create_pc_index(oai_embeddings, emb_dimension)
 
-        retriever = vector_store.as_retriever(search_type=question_answer.search_type,
-                                              search_kwargs={'k': question_answer.top_k,
-                                                             'namespace': question_answer.namespace}
-                                              )
+        vs_retriever = vector_store.as_retriever(search_type=question_answer.search_type,
+                                                 search_kwargs={'k': question_answer.top_k,
+                                                                'namespace': question_answer.namespace}
+                                                 )
+
+        redundant_filter = EmbeddingsRedundantFilter(embeddings=oai_embeddings,
+                                                     similarity_threshold=question_answer.similarity_threshold)
+        pipeline_compressor = DocumentCompressorPipeline(
+            transformers=[redundant_filter]
+        )
+        retriever = ContextualCompressionRetriever(
+            base_compressor=pipeline_compressor, base_retriever=vs_retriever
+        )
 
         if question_answer.system_context is not None and question_answer.system_context:
diff --git a/tilellm/models/item_model.py b/tilellm/models/item_model.py
index 90e4e9a..e9ceae4 100644
--- a/tilellm/models/item_model.py
+++ b/tilellm/models/item_model.py
@@ -1,8 +1,25 @@
-from pydantic import BaseModel, Field, field_validator, ValidationError
+from pydantic import BaseModel, Field, field_validator, ValidationError, model_validator
 from typing import Dict, Optional, List, Union
 import datetime
 
 
+class ParametersScrapeType4(BaseModel):
+    unwanted_tags: Optional[List[str]] = Field(default_factory=list)
+    tags_to_extract: Optional[List[str]] = Field(default_factory=list)
+    unwanted_classnames: Optional[List[str]] = Field(default_factory=list)
+    desired_classnames: Optional[List[str]] = Field(default_factory=list)
+    remove_lines: Optional[bool] = Field(default=False)
+    remove_comments: Optional[bool] = Field(default=False)
+
+    @model_validator(mode='after')
+    def check_booleans(cls, values):
+        remove_lines = values.remove_lines
+        remove_comments = values.remove_comments
+        if remove_lines is None or remove_comments is None:
+            raise ValueError('remove_lines and remove_comments must be provided in ParametersScrapeType4')
+        return values
+
+
 class ItemSingle(BaseModel):
     id: str
     source: str | None = None
@@ -15,6 +32,19 @@
     webhook: str = Field(default_factory=lambda: "")
     chunk_size: int = Field(default_factory=lambda: 1000)
     chunk_overlap: int = Field(default_factory=lambda: 400)
+    parameters_scrape_type_4: Optional[ParametersScrapeType4] = None
+
+    @model_validator(mode='after')
+    def check_scrape_type(cls, values):
+        scrape_type = values.scrape_type
+        parameters_scrape_type_4 = values.parameters_scrape_type_4
+
+        if scrape_type == 4:
+            if parameters_scrape_type_4 is None:
+                raise ValueError('parameters_scrape_type_4 must be provided when scrape_type is 4')
+        else:
+            values.parameters_scrape_type_4 = None
+        return values
 
 
 class MetadataItem(BaseModel):
@@ -57,6 +87,7 @@
     top_k: int = Field(default=5)
     max_tokens: int = Field(default=128)
    embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
+    similarity_threshold: float = Field(default_factory=lambda: 1.0)
     debug: bool = Field(default_factory=lambda: False)
     system_context: Optional[str] = None
     search_type: str = Field(default_factory=lambda: "similarity")
diff --git a/tilellm/store/pinecone/pinecone_repository_pod.py b/tilellm/store/pinecone/pinecone_repository_pod.py
index baa7cee..d5d6f9e 100644
--- a/tilellm/store/pinecone/pinecone_repository_pod.py
+++ b/tilellm/store/pinecone/pinecone_repository_pod.py
@@ -43,6 +43,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)
         scrape_type = item.scrape_type
         chunk_size = item.chunk_size
         chunk_overlap = item.chunk_overlap
+        parameters_scrape_type_4 = item.parameters_scrape_type_4
         try:
             await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
         except Exception as ex:
@@ -67,7 +68,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)
         documents = []
 
         if type_source == 'url' or type_source == 'txt':
-            documents = get_content_by_url(source, scrape_type)
+            documents = await get_content_by_url(source,
+                                                 scrape_type,
+                                                 parameters_scrape_type_4=parameters_scrape_type_4)
         else:  # type_source == 'pdf' or 'docx' or 'txt':
             documents = load_document(source, type_source)
diff --git a/tilellm/store/pinecone/pinecone_repository_serverless.py b/tilellm/store/pinecone/pinecone_repository_serverless.py
index 7cabc3b..9636441 100644
--- a/tilellm/store/pinecone/pinecone_repository_serverless.py
+++ b/tilellm/store/pinecone/pinecone_repository_serverless.py
@@ -45,6 +45,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):
         scrape_type = item.scrape_type
         chunk_size = item.chunk_size
         chunk_overlap = item.chunk_overlap
+        parameters_scrape_type_4 = item.parameters_scrape_type_4
         try:
             await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
         except Exception as ex:
@@ -73,7 +74,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):
         documents = []
 
         if type_source == 'url' or type_source == 'txt':
-            documents = get_content_by_url(source, scrape_type)
+            documents = await get_content_by_url(source,
+                                                 scrape_type,
+                                                 parameters_scrape_type_4=parameters_scrape_type_4)
         else:  # elif type_source == 'pdf' or 'docx' or 'txt':
             documents = load_document(source, type_source)
diff --git a/tilellm/tools/document_tool_simple.py b/tilellm/tools/document_tool_simple.py
index 33dbaff..8a3488b 100644
--- a/tilellm/tools/document_tool_simple.py
+++ b/tilellm/tools/document_tool_simple.py
@@ -3,14 +3,30 @@
 from langchain_community.document_loaders import UnstructuredURLLoader
 from langchain_community.document_loaders import AsyncChromiumLoader
+from langchain_community.document_loaders import PlaywrightURLLoader
+
 import requests
 import logging
 
+from langchain_community.document_transformers import BeautifulSoupTransformer
+from langchain_core.documents import Document
+
 logger = logging.getLogger(__name__)
 
 
 # "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
-def get_content_by_url(url: str, scrape_type: int):
+async def get_content_by_url(url: str, scrape_type: int, **kwargs) -> list[Document]:
+    """
+    Get content by URL: parse the HTML page and extract its content.
+    If scrape_type=0, UnstructuredURLLoader analyzes the page and extracts structured elements (title, lists, etc.).
+    If scrape_type=1, UnstructuredURLLoader extracts the whole page as a single document.
+    If scrape_type=2, PlaywrightURLLoader is used.
+    If scrape_type=3, AsyncChromiumLoader is used and the HTML is transformed into plain text.
+    If scrape_type=4, AsyncChromiumLoader and BeautifulSoup are used to select the HTML elements to extract.
+    :param url: str representing the URL
+    :param scrape_type: 0|1|2|3|4
+    :return: list[Document]
+    """
     try:
         urls = [url]
         if scrape_type == 0:
@@ -18,12 +34,37 @@ def get_content_by_url(url: str, scrape_type: int):
                 urls=urls, mode="elements", strategy="fast",
                 continue_on_failure=False, headers={'user-agent': 'Mozilla/5.0'}
             )
-        else:
+            docs = await loader.aload()
+
+        elif scrape_type == 1:
             loader = UnstructuredURLLoader(
                 urls=urls, mode="single", continue_on_failure=False,
                 headers={'user-agent': 'Mozilla/5.0'}
             )
-        docs = loader.load()
+            docs = await loader.aload()
+        elif scrape_type == 2:
+            loader = PlaywrightURLLoader(urls=urls)
+            docs = await loader.aload()
+        elif scrape_type == 3:
+            loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
+            docs = await loader.aload()
+            from langchain_community.document_transformers import Html2TextTransformer
+            html2text = Html2TextTransformer()
+            docs_transformed = html2text.transform_documents(docs)
+            docs = docs_transformed
+        else:
+            params_type_4 = kwargs.get("parameters_scrape_type_4")
+            loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
+            docs = await loader.aload()
+            bs_transformer = BeautifulSoupTransformer()
+            docs_transformed = bs_transformer.transform_documents(docs,
+                                                                  tags_to_extract=params_type_4.tags_to_extract,
+                                                                  unwanted_tags=params_type_4.unwanted_tags,
+                                                                  unwanted_classnames=params_type_4.unwanted_classnames,
+                                                                  remove_lines=params_type_4.remove_lines,
+                                                                  remove_comments=params_type_4.remove_comments
+                                                                  )
+            docs = docs_transformed
 
         for doc in docs:
             doc.metadata = clean_metadata(doc.metadata)
@@ -57,6 +98,8 @@ def load_document(url: str, type_source: str):
         return None
 
     data = loader.load()
+    # from pprint import pprint
+    # pprint(data)
 
     return data
 
@@ -67,7 +110,7 @@ def load_from_wikipedia(query, lang='en', load_max_docs=2):
     return data
 
 
-def get_content_by_url_with_bs(url:str):
+def get_content_by_url_with_bs(url: str):
     html = requests.get(url)
     # urls = [url]
     # Load HTML
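
A minimal usage sketch of the new scrape_type=4 path follows. The URL and the selector values are illustrative; only ParametersScrapeType4, its field names, and the get_content_by_url signature come from this patch.

    import asyncio

    from tilellm.models.item_model import ParametersScrapeType4
    from tilellm.tools.document_tool_simple import get_content_by_url


    async def main():
        # Illustrative selector set; the field names are those of ParametersScrapeType4.
        params = ParametersScrapeType4(
            tags_to_extract=["article", "p"],
            unwanted_tags=["script", "style"],
            unwanted_classnames=["cookie-banner"],
            remove_lines=True,
            remove_comments=True,
        )
        # scrape_type=4 routes through AsyncChromiumLoader + BeautifulSoupTransformer;
        # get_content_by_url is now a coroutine, so it has to be awaited.
        docs = await get_content_by_url("https://example.com/article",
                                        4,
                                        parameters_scrape_type_4=params)
        for doc in docs:
            print(doc.metadata, doc.page_content[:80])


    asyncio.run(main())

ItemSingle.check_scrape_type enforces the same contract at the API boundary: parameters_scrape_type_4 is mandatory when scrape_type is 4 and is reset to None for every other scrape_type.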
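
On the retrieval side, the new "similarity_threshold" field of QuestionAnswer feeds the EmbeddingsRedundantFilter inside the ContextualCompressionRetriever built in ask_with_memory. Below is a sketch of an /api/qa call that exercises it, assuming a POST endpoint on a locally running instance; the base URL, the question text, and the HTTP method are assumptions, while the remaining fields are the ones added or referenced in this patch.

    import requests

    payload = {
        "question": "What does the refund policy say?",  # assumed field, not shown in this diff
        "namespace": "my-namespace",                      # illustrative namespace
        "top_k": 5,
        "search_type": "similarity",
        "similarity_threshold": 0.95,  # values below the 1.0 default let the filter drop
                                       # retrieved chunks that are near-duplicates of one
                                       # another before they reach the LLM
    }

    response = requests.post("http://localhost:8000/api/qa", json=payload)
    print(response.json())

With the default of 1.0 the filter leaves the retrieved set essentially untouched, so existing clients keep their previous behaviour.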