Commit

added scrape_type
glorenzo972 committed Jun 7, 2024
1 parent 18c1484 commit 4f29e42
Showing 4 changed files with 66 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,10 @@
*Andrea Sponziello*
### **Copyright**: *Tiledesk SRL*

## [2024-06-07]
### 0.1.21
- added: support for pdf, docx and txt

## [2024-06-06]
### 0.1.20
- added: log_conf.json
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tilellm"
version = "0.1.20"
version = "0.1.21"
description = "tiledesk for RAG"
authors = ["Gianluca Lorenzo <[email protected]>"]
repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -30,6 +30,9 @@ beautifulsoup4 ="^4.12.3"
#uvicorn = "^0.28"
unstructured= "^0.12.6"
#playwright = "^1.43.0"
pypdf="^4.2.0"
docx2txt="^0.8"
wikipedia="^1.4.0"

[tool.poetry.dependencies.uvicorn]
version = "^0.28"
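
Aside (not part of the diff): the three new dependencies back the document loaders wired up in tilellm/tools/document_tool_simple.py below, pypdf for PyPDFLoader, docx2txt for Docx2txtLoader, and wikipedia for WikipediaLoader. A minimal sanity-check sketch, assuming an environment where the project's dependencies are installed:

# Confirm the new packages import and that the langchain loaders built on them resolve.
import pypdf       # backend for PyPDFLoader
import docx2txt    # backend for Docx2txtLoader
import wikipedia   # backend for WikipediaLoader

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, WikipediaLoader

print(pypdf.__version__)  # should satisfy the ^4.2.0 constraint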
18 changes: 15 additions & 3 deletions tilellm/store/pinecone_repository.py
@@ -3,7 +3,12 @@
PineconeItems,
PineconeIndexingResult
)
from tilellm.tools.document_tool_simple import get_content_by_url, get_content_by_url_with_bs
from tilellm.tools.document_tool_simple import (get_content_by_url,
get_content_by_url_with_bs,
load_document,
load_from_wikipedia
)

from tilellm.shared import const
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
@@ -48,8 +53,13 @@ async def add_pc_item(item):
    cost = 0

    try:
        if type_source == 'url':
            documents = get_content_by_url(source, scrape_type)
        if type_source in ('url', 'pdf', 'docx', 'txt'):
            documents = []
            if type_source == 'url':
                documents = get_content_by_url(source, scrape_type)
            elif type_source in ('pdf', 'docx', 'txt'):
                documents = load_document(source, type_source)

        for document in documents:
            document.metadata["id"] = metadata_id
            document.metadata["source"] = source
@@ -609,3 +619,5 @@ def get_embeddings_dimension(embedding):
emb_dimension = 1536

return emb_dimension


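Aside (not part of the diff): the new branch in add_pc_item routes URL sources through get_content_by_url (with the scrape_type flag) and pdf/docx/txt sources through load_document. A standalone sketch of that routing; fetch_documents is a hypothetical helper, the sample source is hypothetical, and the accepted values of scrape_type are not shown in this diff:

# Illustrative routing, mirroring the branch added to add_pc_item above.
from tilellm.tools.document_tool_simple import get_content_by_url, load_document

def fetch_documents(type_source: str, source: str, scrape_type: int = 0):
    # URL sources are scraped; pdf/docx/txt sources go through the langchain loaders.
    if type_source == 'url':
        return get_content_by_url(source, scrape_type)
    if type_source in ('pdf', 'docx', 'txt'):
        return load_document(source, type_source)
    raise ValueError(f"unsupported type_source: {type_source}")

docs = fetch_documents('pdf', 'https://example.com/manual.pdf')  # hypothetical source
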
48 changes: 43 additions & 5 deletions tilellm/tools/document_tool_simple.py
@@ -4,9 +4,12 @@
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import AsyncChromiumLoader
import requests
import logging

logger = logging.getLogger(__name__)

#"https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"

# "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
def get_content_by_url(url: str, scrape_type: int):
    try:
        urls = [url]
@@ -27,12 +30,44 @@ def get_content_by_url(url: str, scrape_type: int):
    except Exception as ex:
        raise ex


def load_document(url: str, type_source: str):
    # import os
    # name, extension = os.path.splitext(file)

    if type_source == 'pdf':
        from langchain_community.document_loaders import PyPDFLoader
        logger.info(f'Loading {url}')
        loader = PyPDFLoader(url)
    elif type_source == 'docx':
        from langchain_community.document_loaders import Docx2txtLoader
        logger.info(f'Loading {url}')
        loader = Docx2txtLoader(url)
    elif type_source == 'txt':
        from langchain_community.document_loaders import TextLoader
        logger.info(f'Loading {url}')
        loader = TextLoader(url)
    else:
        logger.info('Document format is not supported!')
        return None

    data = loader.load()
    return data


def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain_community.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data
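
Aside (not part of the diff): a short usage sketch of the two new helpers; the file path and query below are hypothetical.

from tilellm.tools.document_tool_simple import load_document, load_from_wikipedia

# Hypothetical local .docx path; returns a list of langchain Documents.
docs = load_document('manual.docx', 'docx')

# Up to two English-language Wikipedia pages matching a hypothetical query.
wiki_docs = load_from_wikipedia('Retrieval-augmented generation', lang='en', load_max_docs=2)

for doc in docs + wiki_docs:
    print(doc.metadata.get('source'), len(doc.page_content))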


def get_content_by_url_with_bs(url:str):
    html = requests.get(url)
    #urls = [url]
    # urls = [url]
    # Load HTML
    #loader = await AsyncChromiumLoader(urls)
    #html = loader.load()
    # loader = await AsyncChromiumLoader(urls)
    # html = loader.load()

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html.content, 'html.parser')
@@ -66,8 +101,11 @@ def get_content_by_url_with_bs(url:str):
        testi.append(testo_doc)

        # Add an empty line between the segments
        #if index < len(h1_tags) - 1:
        # if index < len(h1_tags) - 1:
        #    print()  # print an empty line between the segments


    return testi


