diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c49b4a..7ce5192 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ ## [2024-05-20] +### 0.1.18 +- added: scrape_type =0|1 + +## [2024-05-20] + ### 0.1.17 - added: PIENCONE_TYPE = "serverless|pod" diff --git a/pyproject.toml b/pyproject.toml index d7b0442..5683d66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "tilellm" -version = "0.1.17" +version = "0.1.18" description = "tiledesk for RAG" authors = ["Gianluca Lorenzo "] repository = "https://github.com/Tiledesk/tiledesk-llm" diff --git a/tilellm/models/item_model.py b/tilellm/models/item_model.py index aa1eeb7..880eb9a 100644 --- a/tilellm/models/item_model.py +++ b/tilellm/models/item_model.py @@ -9,6 +9,7 @@ class ItemSingle(BaseModel): type: str | None = None content: str | None = None gptkey: str | None = None + scrape_type: int = Field(default_factory=lambda: 0) embedding: str = Field(default_factory=lambda: "text-embedding-ada-002") namespace: str | None = None webhook: str = Field(default_factory=lambda: "") diff --git a/tilellm/store/pinecone_repository.py b/tilellm/store/pinecone_repository.py index b94d26f..b038bd4 100644 --- a/tilellm/store/pinecone_repository.py +++ b/tilellm/store/pinecone_repository.py @@ -30,6 +30,7 @@ async def add_pc_item(item): gpt_key = item.gptkey embedding = item.embedding namespace = item.namespace + scrape_type = item.scrape_type try: await delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace) except Exception as ex: @@ -47,7 +48,7 @@ async def add_pc_item(item): cost = 0 if type_source == 'url': - documents = get_content_by_url(source) + documents = get_content_by_url(source, scrape_type) for document in documents: document.metadata["id"] = metadata_id document.metadata["source"] = source diff --git a/tilellm/tools/document_tool_simple.py b/tilellm/tools/document_tool_simple.py index cacbb5e..0916b7c 100644 --- a/tilellm/tools/document_tool_simple.py +++ 
#"https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
def get_content_by_url(url: str, scrape_type: int = 0):
    """Load the content of a single URL through ``UnstructuredURLLoader``.

    Args:
        url: Address of the page to load; it is passed to the loader as a
            one-element ``urls`` list.
        scrape_type: Selects the loader configuration.
            ``0`` (the default) uses ``mode="elements"`` together with
            ``strategy="fast"``; any other value uses ``mode="single"``.
            The default keeps callers that pass only ``url`` working and
            preserves the behaviour this function had before the
            ``scrape_type`` switch was introduced.

    Returns:
        Whatever ``UnstructuredURLLoader.load()`` returns for the URL.
        NOTE(review): presumably a list of documents exposing ``.metadata``
        (the Pinecone repository mutates ``document.metadata``) — confirm
        against the loader's documentation.
    """
    # Only the keyword options differ between the two scrape types, so build
    # them separately and construct the loader in a single place.
    if scrape_type == 0:
        loader_options = {"mode": "elements", "strategy": "fast"}
    else:
        loader_options = {"mode": "single"}

    loader = UnstructuredURLLoader(urls=[url], **loader_options)
    return loader.load()