Skip to content

Commit

Permalink
added scrape_type
Browse files Browse the repository at this point in the history
  • Loading branch information
glorenzo972 committed May 28, 2024
1 parent cd5fa6c commit c4ca087
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 8 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@

## [2024-05-28]

### 0.1.18
- added: scrape_type = 0|1 (0 = UnstructuredURLLoader in "elements" mode with the "fast" strategy; 1 = "single" mode)

## [2024-05-20]

### 0.1.17
- added: PINECONE_TYPE = "serverless|pod"

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.1.17"
version = "0.1.18"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
Expand Down
1 change: 1 addition & 0 deletions tilellm/models/item_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class ItemSingle(BaseModel):
type: str | None = None
content: str | None = None
gptkey: str | None = None
scrape_type: int = Field(default_factory=lambda: 0)
embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
namespace: str | None = None
webhook: str = Field(default_factory=lambda: "")
Expand Down
3 changes: 2 additions & 1 deletion tilellm/store/pinecone_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ async def add_pc_item(item):
gpt_key = item.gptkey
embedding = item.embedding
namespace = item.namespace
scrape_type = item.scrape_type
try:
await delete_pc_ids_namespace(metadata_id=metadata_id, namespace=namespace)
except Exception as ex:
Expand All @@ -47,7 +48,7 @@ async def add_pc_item(item):
cost = 0

if type_source == 'url':
documents = get_content_by_url(source)
documents = get_content_by_url(source, scrape_type)
for document in documents:
document.metadata["id"] = metadata_id
document.metadata["source"] = source
Expand Down
17 changes: 11 additions & 6 deletions tilellm/tools/document_tool_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,20 @@


#"https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
def get_content_by_url(url:str):
def get_content_by_url(url: str, scrape_type: int):
urls =[url]
loader = UnstructuredURLLoader(
urls=urls, mode="elements", strategy="fast",
)
if scrape_type == 0:
loader = UnstructuredURLLoader(
urls=urls, mode="elements", strategy="fast",
)
else:
loader = UnstructuredURLLoader(
urls=urls, mode="single"
)
docs = loader.load()

#from pprint import pprint
#pprint(docs)
# from pprint import pprint
# pprint(docs)

return docs

Expand Down

0 comments on commit c4ca087

Please sign in to comment.