fix: user-agent for scrape
glorenzo972 committed Jul 26, 2024
1 parent b2b497e commit 3803d70
Showing 8 changed files with 114 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,11 @@
*Andrea Sponziello*
### **Copyright**: *Tiledesk SRL*

## [2024-07-26]
### 0.2.7
- add: scrape_type=3|4
- add: to /api/qa "similarity_threshold"

## [2024-07-09]
### 0.2.6
- add: DELETE /api/chunk/<chunk_id>/namespace/<namespace>
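A rough sketch of how the two options added in 0.2.7 might be exercised from a client. The /api/qa path comes from the changelog; the host, port and the payload fields other than similarity_threshold, top_k, namespace and search_type (all visible in QuestionAnswer below) are assumptions, not confirmed by this commit:

import requests

# Hypothetical request against a local tilellm instance (host/port assumed).
payload = {
    "question": "What does the PNRR fund?",   # field name assumed, not shown in this diff
    "namespace": "my-namespace",              # illustrative value
    "top_k": 5,
    "search_type": "similarity",
    "similarity_threshold": 0.95,             # new in 0.2.7: drops near-duplicate chunks
}
response = requests.post("http://localhost:8000/api/qa", json=payload)
print(response.json())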
2 changes: 2 additions & 0 deletions Dockerfile
@@ -14,6 +14,8 @@ RUN pip install .
RUN pip install "uvicorn[standard]" gunicorn
RUN python -m nltk.downloader punkt
RUN python -m nltk.downloader averaged_perceptron_tagger
RUN playwright install chromium
RUN playwright install-deps chromium
# TODO: adjust Redis configuration
ENV REDIS_HOST=redis
ENV REDIS_URL=redis://redis:6379/0
7 changes: 4 additions & 3 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.2.6"
version = "0.2.7"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -34,11 +34,12 @@ langchain_community = "0.2.x"
tiktoken = "0.7.x"
beautifulsoup4 = "^4.12.3"
#uvicorn = "^0.28"
unstructured= "0.14.x"
#playwright = "^1.43.0"
unstructured= "0.15.0"
playwright = "1.45.1"
pypdf="^4.2.0"
docx2txt="^0.8"
wikipedia="^1.4.0"
html2text="2024.2.26"
psutil="^6.0.0"

[tool.poetry.dependencies.uvicorn]
20 changes: 16 additions & 4 deletions tilellm/controller/controller.py
@@ -2,6 +2,9 @@

import fastapi
from langchain.chains import ConversationalRetrievalChain, LLMChain # Deprecata
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_core.prompts import PromptTemplate, SystemMessagePromptTemplate
from langchain_openai import ChatOpenAI
# from tilellm.store.pinecone_repository import add_pc_item as pinecone_add_item
@@ -240,10 +243,19 @@ async def ask_with_memory(question_answer, repo=None) -> RetrievalResult:

vector_store = await repo.create_pc_index(oai_embeddings, emb_dimension)

vs_retriever = vector_store.as_retriever(search_type=question_answer.search_type,
                                         search_kwargs={'k': question_answer.top_k,
                                                        'namespace': question_answer.namespace}
                                         )

redundant_filter = EmbeddingsRedundantFilter(embeddings=oai_embeddings,
similarity_threshold=question_answer.similarity_threshold)
pipeline_compressor = DocumentCompressorPipeline(
transformers=[redundant_filter]
)
retriever = ContextualCompressionRetriever(
base_compressor=pipeline_compressor, base_retriever=vs_retriever
)

if question_answer.system_context is not None and question_answer.system_context:

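The block above threads the new similarity_threshold through an EmbeddingsRedundantFilter wrapped in a ContextualCompressionRetriever, so retrieved chunks that are nearly identical to one another are dropped before they reach the prompt. A minimal, self-contained sketch of that filtering step follows; the sample documents, the 0.95 threshold and the use of OpenAIEmbeddings are illustrative, not taken from the repository:

from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()  # assumes OPENAI_API_KEY is set in the environment
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings,
                                             similarity_threshold=0.95)

docs = [
    Document(page_content="Tiledesk is an open-source live chat platform."),
    Document(page_content="Tiledesk is an open source live-chat platform."),  # near-duplicate
    Document(page_content="The PNRR funds research and innovation projects."),
]

filtered = redundant_filter.transform_documents(docs)
print(len(filtered))  # expected: 2, the near-duplicate is dropped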
33 changes: 32 additions & 1 deletion tilellm/models/item_model.py
@@ -1,8 +1,25 @@
from pydantic import BaseModel, Field, field_validator, ValidationError, model_validator
from typing import Dict, Optional, List, Union
import datetime


class ParametersScrapeType4(BaseModel):
unwanted_tags: Optional[List[str]] = Field(default_factory=list)
tags_to_extract: Optional[List[str]] = Field(default_factory=list)
unwanted_classnames: Optional[List[str]] = Field(default_factory=list)
desired_classnames: Optional[List[str]] = Field(default_factory=list)
remove_lines: Optional[bool] = Field(default=False)
remove_comments: Optional[bool] = Field(default=False)

@model_validator(mode='after')
def check_booleans(cls, values):
remove_lines = values.remove_lines
remove_comments = values.remove_comments
if remove_lines is None or remove_comments is None:
raise ValueError('remove_lines and remove_comments must be provided in ParametersScrapeType4')
return values


class ItemSingle(BaseModel):
id: str
source: str | None = None
@@ -15,6 +32,19 @@ class ItemSingle(BaseModel):
webhook: str = Field(default_factory=lambda: "")
chunk_size: int = Field(default_factory=lambda: 1000)
chunk_overlap: int = Field(default_factory=lambda: 400)
parameters_scrape_type_4: Optional[ParametersScrapeType4] = None

@model_validator(mode='after')
def check_scrape_type(cls, values):
scrape_type = values.scrape_type
parameters_scrape_type_4 = values.parameters_scrape_type_4

if scrape_type == 4:
if parameters_scrape_type_4 is None:
raise ValueError('parameters_scrape_type_4 must be provided when scrape_type is 4')
else:
values.parameters_scrape_type_4 = None
return values


class MetadataItem(BaseModel):
@@ -57,6 +87,7 @@ class QuestionAnswer(BaseModel):
top_k: int = Field(default=5)
max_tokens: int = Field(default=128)
embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
similarity_threshold: float = Field(default_factory=lambda: 1.0)
debug: bool = Field(default_factory=lambda: False)
system_context: Optional[str] = None
search_type: str = Field(default_factory=lambda: "similarity")
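A small usage sketch for the new scraping parameters; every value below is invented for illustration:

# Illustrative construction of the new model; all values are made up.
params = ParametersScrapeType4(
    tags_to_extract=["article", "p"],
    unwanted_tags=["script", "style"],
    unwanted_classnames=["cookie-banner", "nav"],
    remove_lines=True,
    remove_comments=True,
)
print(params.model_dump())

# ItemSingle.check_scrape_type then requires such an object whenever
# scrape_type == 4 and silently resets it to None for any other scrape_type.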
5 changes: 4 additions & 1 deletion tilellm/store/pinecone/pinecone_repository_pod.py
@@ -43,6 +43,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
parameters_scrape_type_4 = item.parameters_scrape_type_4
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -67,7 +68,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None)

documents = []
if type_source == 'url' or type_source == 'txt':
documents = await get_content_by_url(source,
                                     scrape_type,
                                     parameters_scrape_type_4=parameters_scrape_type_4)
else: # type_source == 'pdf' or 'docx' or 'txt':
documents = load_document(source, type_source)

5 changes: 4 additions & 1 deletion tilellm/store/pinecone/pinecone_repository_serverless.py
@@ -45,6 +45,7 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):
scrape_type = item.scrape_type
chunk_size = item.chunk_size
chunk_overlap = item.chunk_overlap
parameters_scrape_type_4 = item.parameters_scrape_type_4
try:
await self.delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
@@ -73,7 +74,9 @@ async def add_pc_item(self, item, embedding_obj=None, embedding_dimension=None):

documents = []
if type_source == 'url' or type_source == 'txt':
documents = await get_content_by_url(source,
                                     scrape_type,
                                     parameters_scrape_type_4=parameters_scrape_type_4)
else: # elif type_source == 'pdf' or 'docx' or 'txt':
documents = load_document(source, type_source)

51 changes: 47 additions & 4 deletions tilellm/tools/document_tool_simple.py
@@ -3,27 +3,68 @@

from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_loaders import PlaywrightURLLoader

import requests
import logging

from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.documents import Document

logger = logging.getLogger(__name__)


# "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
async def get_content_by_url(url: str, scrape_type: int, **kwargs) -> list[Document]:
"""
Get content by url! parse html page and extract content.
If scrape_type=0 Unstructured analyze the page and extract some useful information about page, like UL, Title etc.
If scrape_type=1, extract all the content.
If scape_type=2 is used playwright.
If scape_type=3 is used AsyncChromiumLoader and the html is transformed in text
If scape_type=4 is used AsyncChromiumLoader and BS4 in order to select the html element to extract
:param url: str representing url
:param scrape_type: 0|1|2!3!4
:return: list[Document]
"""
try:
urls = [url]
if scrape_type == 0:
loader = UnstructuredURLLoader(
urls=urls, mode="elements", strategy="fast", continue_on_failure=False,
headers={'user-agent': 'Mozilla/5.0'}
)
docs = await loader.aload()

elif scrape_type == 1:
loader = UnstructuredURLLoader(
urls=urls, mode="single", continue_on_failure=False,
headers={'user-agent': 'Mozilla/5.0'}
)
docs = await loader.aload()
elif scrape_type == 2:
loader = PlaywrightURLLoader(urls=urls)
docs = await loader.aload()
elif scrape_type == 3:
loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
docs = await loader.aload()
from langchain_community.document_transformers import Html2TextTransformer
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)
docs = docs_transformed
else:
params_type_4 = kwargs.get("parameters_scrape_type_4")
loader = AsyncChromiumLoader(urls=urls, user_agent='Mozilla/5.0')
docs = await loader.aload()
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs,
tags_to_extract=params_type_4.tags_to_extract,
unwanted_tags=params_type_4.unwanted_tags,
unwanted_classnames=params_type_4.unwanted_classnames,
remove_lines=params_type_4.remove_lines,
remove_comments=params_type_4.remove_comments
)
docs = docs_transformed

for doc in docs:
doc.metadata = clean_metadata(doc.metadata)
@@ -57,6 +98,8 @@ def load_document(url: str, type_source: str):
return None

data = loader.load()
# from pprint import pprint
# pprint(data)
return data


@@ -67,7 +110,7 @@ def load_from_wikipedia(query, lang='en', load_max_docs=2):
return data


def get_content_by_url_with_bs(url: str):
html = requests.get(url)
# urls = [url]
# Load HTML
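A hedged sketch of calling the reworked loader with scrape_type=4. The URL reuses the example from the comment above get_content_by_url, ParametersScrapeType4 comes from tilellm.models.item_model, the parameter values are illustrative, and a working Playwright/Chromium installation (as set up in the Dockerfile) is assumed:

import asyncio

from tilellm.models.item_model import ParametersScrapeType4
from tilellm.tools.document_tool_simple import get_content_by_url

async def main():
    params = ParametersScrapeType4(
        tags_to_extract=["article"],
        unwanted_tags=["script", "style"],
        remove_lines=True,
        remove_comments=True,
    )
    docs = await get_content_by_url(
        "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/",
        4,  # scrape_type=4: AsyncChromiumLoader + BeautifulSoup element selection
        parameters_scrape_type_4=params,
    )
    print(len(docs), docs[0].metadata if docs else None)

asyncio.run(main())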
