diff --git a/CHANGELOG.md b/CHANGELOG.md
index 788c64b..bb86f5a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@
 *Andrea Sponziello*
 ### **Copyrigth**: *Tiledesk SRL*
+## [2024-06-07]
+### 0.1.21
+- added: support for pdf, docx and txt
+
 ## [2024-06-06]
 ### 0.1.20
 - added: log_conf.json
diff --git a/pyproject.toml b/pyproject.toml
index c23d0cf..eeea46c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tilellm"
-version = "0.1.20"
+version = "0.1.21"
 description = "tiledesk for RAG"
 authors = ["Gianluca Lorenzo "]
 repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -30,6 +30,9 @@
 beautifulsoup4 ="^4.12.3"
 #uvicorn = "^0.28"
 unstructured= "^0.12.6"
 #playwright = "^1.43.0"
+pypdf="^4.2.0"
+docx2txt="^0.8"
+wikipedia="^1.4.0"

 [tool.poetry.dependencies.uvicorn]
 version = "^0.28"
diff --git a/tilellm/store/pinecone_repository.py b/tilellm/store/pinecone_repository.py
index b7a84ec..83c017b 100644
--- a/tilellm/store/pinecone_repository.py
+++ b/tilellm/store/pinecone_repository.py
@@ -3,7 +3,12 @@
     PineconeItems,
     PineconeIndexingResult
 )
-from tilellm.tools.document_tool_simple import get_content_by_url, get_content_by_url_with_bs
+from tilellm.tools.document_tool_simple import (get_content_by_url,
+                                                get_content_by_url_with_bs,
+                                                load_document,
+                                                load_from_wikipedia
+                                                )
+
 from tilellm.shared import const
 from langchain_core.documents import Document
 from langchain_openai import OpenAIEmbeddings
@@ -48,8 +53,13 @@ async def add_pc_item(item):
     cost = 0

     try:
-        if type_source == 'url':
-            documents = get_content_by_url(source, scrape_type)
+        if type_source in ('url', 'pdf', 'docx', 'txt'):
+            documents = []
+            if type_source == 'url':
+                documents = get_content_by_url(source, scrape_type)
+            elif type_source in ('pdf', 'docx', 'txt'):
+                documents = load_document(source, type_source)
+
             for document in documents:
                 document.metadata["id"] = metadata_id
                 document.metadata["source"] = source
@@ -609,3 +619,5 @@ def get_embeddings_dimension(embedding):
         emb_dimension = 1536

     return emb_dimension
+
+
diff --git a/tilellm/tools/document_tool_simple.py b/tilellm/tools/document_tool_simple.py
index c5a595f..0f8c009 100644
--- a/tilellm/tools/document_tool_simple.py
+++ b/tilellm/tools/document_tool_simple.py
@@ -4,9 +4,12 @@
 from langchain_community.document_loaders import UnstructuredURLLoader
 from langchain_community.document_loaders import AsyncChromiumLoader
 import requests
+import logging

+logger = logging.getLogger(__name__)

-#"https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
+
+# "https://help.tiledesk.com/mychatbots/articles/il-pnrr-per-la-ricerca-e-linnovazione/"
 def get_content_by_url(url: str, scrape_type: int):
     try:
         urls = [url]
@@ -27,12 +30,44 @@
     except Exception as ex:
         raise ex

+
+def load_document(url: str, type_source: str):
+    # import os
+    # name, extension = os.path.splitext(file)
+
+    if type_source == 'pdf':
+        from langchain_community.document_loaders import PyPDFLoader
+        logger.info(f'Loading {url}')
+        loader = PyPDFLoader(url)
+    elif type_source == 'docx':
+        from langchain_community.document_loaders import Docx2txtLoader
+        logger.info(f'Loading {url}')
+        loader = Docx2txtLoader(url)
+    elif type_source == 'txt':
+        from langchain_community.document_loaders import TextLoader
+        logger.info(f'Loading {url}')
+        loader = TextLoader(url)
+    else:
+        logger.info('Document format is not supported!')
+        return None
+
+    data = loader.load()
+    return data
+
+
+def load_from_wikipedia(query, lang='en', load_max_docs=2):
+    from langchain_community.document_loaders import WikipediaLoader
+    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
+    data = loader.load()
+    return data
+
+
 def get_content_by_url_with_bs(url:str):
     html = requests.get(url)
-    #urls = [url]
+    # urls = [url]
     # Load HTML
-    #loader = await AsyncChromiumLoader(urls)
-    #html = loader.load()
+    # loader = await AsyncChromiumLoader(urls)
+    # html = loader.load()

     from bs4 import BeautifulSoup
     soup = BeautifulSoup(html.content, 'html.parser')
@@ -66,8 +101,11 @@ def get_content_by_url_with_bs(url:str):
         testi.append(testo_doc)

     # Aggiungi una riga vuota tra i segmenti
-    #if index < len(h1_tags) - 1:
+    # if index < len(h1_tags) - 1:
     #    print()  # Stampa una riga vuota tra i segmenti

     return testi
+
+
+
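
Reviewer note: below is a minimal usage sketch of the two helpers this diff introduces in tilellm/tools/document_tool_simple.py. It assumes the package is installed locally (e.g. via poetry install) so the new pypdf, docx2txt and wikipedia dependencies are available; the file paths and the Wikipedia query are hypothetical placeholders, not part of the change.

    from tilellm.tools.document_tool_simple import load_document, load_from_wikipedia

    # load_document returns a list of langchain Documents, or None for an
    # unsupported type_source (PyPDFLoader yields one Document per page).
    for source, type_source in [("sample.pdf", "pdf"),    # hypothetical paths
                                ("sample.docx", "docx"),
                                ("notes.txt", "txt")]:
        docs = load_document(source, type_source)
        if docs is None:
            print(f"Unsupported source type: {type_source}")
            continue
        print(f"{source}: {len(docs)} document(s) loaded")

    # load_from_wikipedia wraps WikipediaLoader and returns up to
    # load_max_docs pages as Documents.
    wiki_docs = load_from_wikipedia("Retrieval-augmented generation",
                                    lang="en", load_max_docs=2)
    print(f"wikipedia: {len(wiki_docs)} document(s) loaded")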