diff --git a/CHANGELOG.md b/CHANGELOG.md
index b29bf04..500a8b6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,8 +5,10 @@
 *Andrea Sponziello*
 ### **Copyrigth**: *Tiledesk SRL*
 
-## [2024-09-21]
-### 0.3.1
+
+## [2024-10-10]
+### 0.3.2-rc2
+- fix: /api/id/{id}/namespace/{namespace}/{token}
 - add sentence embedding with bge-m3
 - add: hybrid search with bg3-m3
 - modify: deleted env variable for vector store
diff --git a/README.md b/README.md
index 64f2f15..97548e1 100644
--- a/README.md
+++ b/README.md
@@ -12,12 +12,8 @@ pip install -e .
 
 ```commandline
-export REDIS_URL="redis://localhost:6379/0"
-export PINECONE_TYPE="serverless|pod"
-export PINECONE_API_KEY="pinecone api key"
-export PINECONE_TEXT_KEY="pinecone field for text - default text in pod content"
-export PINECONE_INDEX="pinecone index name"
-export TILELLM_ROLE="role in pod. Train enable all the APIs, qa do not consume redis queue only Q&A"
+export JWT_SECRET_KEY="yourkey-256-bit"
+export TOKENIZERS_PARALLELISM=false
 export WORKERS=INT number of workers 2*CPU+1
 export TIMEOUT=INT seconds of timeout default=180
 export MAXREQUESTS=INT The maximum number of requests a worker will process before restarting. deafult=1200
@@ -34,11 +30,9 @@ sudo docker build -t tilellm .
 ```
-sudo docker run -d -p 8000:8000 --env environment="dev|prod" \
---env PINECONE_API_KEY="yourapikey" \
---env PINECONE_TEXT_KEY="text|content" \
---env PINECONE_INDEX="index_name" \
---env TILELLM_ROLE="train|qa" \
+sudo docker run -d -p 8000:8000 \
+--env JWT_SECRET_KEY="yourkey-256-bit" \
+--env TOKENIZERS_PARALLELISM=false \
 --env WORKERS=3 \
 --env TIMEOUT=180 \
 --env MAXREQUESTS=1200 \
@@ -145,19 +139,52 @@ In this method, the gradient of distance is used to split chunks along with the
 ```json
 {
-  ...
-  "embedding":"huggingface",
-  "hybrid":true,
-  "sparse_encoder":"splade|bge-m3",
-  ...
-  "engine":
-    {
-      "name": "",
-      "type": "",
-      "apikey" : "",
-      "vector_size": 1024,
-      "index_name": ""
-    }
+  "id": "content id",
+  "source": "name or url of document",
+  "type": "text|txt|url|pdf|docx",
+  "content": "content of document",
+  "hybrid": true,
+  "sparse_encoder": "splade|bge-m3",
+  "gptkey": "llm key; openai|anthropic|groq|cohere|gemini|ollama",
+  "scrape_type": 0,
+  "embedding": "name of embedding; huggingface|ollama|openai...|bge-m3",
+  "model": {
+    "name": "optional, used only with ollama",
+    "url": "ollama base url",
+    "dimension": 3072
+  },
+  "namespace": "vector store namespace",
+  "webhook": "string",
+  "semantic_chunk": false,
+  "breakpoint_threshold_type": "percentile",
+  "chunk_size": 1000,
+  "chunk_overlap": 100,
+  "parameters_scrape_type_4": {
+    "unwanted_tags": [
+      "string"
+    ],
+    "tags_to_extract": [
+      "string"
+    ],
+    "unwanted_classnames": [
+      "string"
+    ],
+    "desired_classnames": [
+      "string"
+    ],
+    "remove_lines": true,
+    "remove_comments": true,
+    "time_sleep": 2
+  },
+  "engine": {
+    "name": "pinecone",
+    "type": "serverless",
+    "apikey": "string",
+    "vector_size": 1536,
+    "index_name": "index name",
+    "text_key": "text for serverless; content for pod",
+    "metric": "cosine|dotproduct for hybrid"
+  }
 }
 ```
@@ -174,7 +201,7 @@ In this method, the gradient of distance is used to split chunks along with the
   "model": "es. claude-3-5-sonnet-20240620 | llama-3.1-70b-versatile",
   "temperature": 0.9,
   "max_tokens":2048,
-  "embedding":"huggingfacce",
+  "embedding":"huggingface",
   "sparse_encoder":"splade|bge-m3",
   "search_type":"hybrid",
   "alpha": 0.2,
diff --git a/pyproject.toml b/pyproject.toml
index 309eeaa..b92c325 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tilellm"
-version = "0.3.0"
+version = "0.3.2-rc2"
 description = "tiledesk for RAG"
 authors = ["Gianluca Lorenzo "]
 repository = "https://github.com/Tiledesk/tiledesk-llm"
@@ -34,6 +34,7 @@
 langchain-community = "0.3.1" #"0.2.10"
 langchain-experimental = "0.3.1" #no previous
 langchain-pinecone = "0.2.0"
 langchain-huggingface="0.1.0"
+langchain-ollama="0.2.0"
 peft = "0.13.0"
 tiktoken = "0.7.0"
diff --git a/tilellm/__main__.py b/tilellm/__main__.py
index 23c8849..9e9c850 100644
--- a/tilellm/__main__.py
+++ b/tilellm/__main__.py
@@ -496,9 +496,13 @@ async def scrape_status_main(scrape_status_req: ScrapeStatusReq,
         return JSONResponse(content=scrape_status_response.model_dump())
     else:
         try:
-            retrieved_pinecone_data = await get_ids_namespace(metadata_id=scrape_status_req.id,
+            repository_engine = RepositoryEngine(engine=scrape_status_req.engine)
+            print(repository_engine.engine)
+            retrieved_pinecone_data = await get_ids_namespace(repository_engine,
+                                                              metadata_id=scrape_status_req.id,
                                                               namespace=scrape_status_req.namespace)
+
             if retrieved_pinecone_data.matches:
                 logger.debug(retrieved_pinecone_data.matches[0].date)
                 date_from_metadata = retrieved_pinecone_data.matches[0].date
diff --git a/tilellm/controller/controller_utils.py b/tilellm/controller/controller_utils.py
index 4f254da..0ed9722 100644
--- a/tilellm/controller/controller_utils.py
+++ b/tilellm/controller/controller_utils.py
@@ -155,9 +155,12 @@ async def generate_answer_with_history(llm, question_answer, rag_chain, retrieve
         | qa_prompt
         | llm.with_structured_output(QuotedAnswer)
     )
-    chain_w_citations = RunnablePassthrough.assign(context=retrieve_docs).assign(
-        answer=rag_chain_from_docs
-    ).assign(only_answer=lambda text: text["answer"].answer)
+
+    chain_w_citations = (RunnablePassthrough.assign(context=retrieve_docs)
+                         .assign(answer=rag_chain_from_docs)
+                         .assign(only_answer=lambda text: text["answer"].answer)
+                         )
+
     conversational_rag_chain = RunnableWithMessageHistory(
         chain_w_citations,
         get_session_history,
diff --git a/tilellm/models/item_model.py b/tilellm/models/item_model.py
index f463804..b58619b 100644
--- a/tilellm/models/item_model.py
+++ b/tilellm/models/item_model.py
@@ -3,6 +3,11 @@ from typing import Dict, Optional, List, Union, Any
 import datetime
 
 
+class OllamaModel(BaseModel):
+    name: str
+    url: str
+    dimension: Optional[int] = 1024
+
 
 class Engine(BaseModel):
     name: str = Field(default="pinecone")
@@ -50,6 +55,7 @@ class ItemSingle(BaseModel):
     gptkey: str | None = None
     scrape_type: int = Field(default_factory=lambda: 0)
     embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
+    model: Optional[OllamaModel] | None = None
     namespace: str | None = None
     webhook: str = Field(default_factory=lambda: "")
     semantic_chunk: Optional[bool] = Field(default=False)
@@ -108,15 +114,15 @@ class QuestionAnswer(BaseModel):
     namespace: str
     llm: Optional[str] = Field(default="openai")
     gptkey: str
-    model: str = Field(default="gpt-3.5-turbo")
+    model: Union[str, OllamaModel] = Field(default="gpt-3.5-turbo")
     sparse_encoder: Optional[str] = Field(default="splade")  #bge-m3
     temperature: float = Field(default=0.0)
     top_k: int = Field(default=5)
-    max_tokens: int = Field(default=128)
+    max_tokens: int = Field(default=1024)
     embedding: str = Field(default_factory=lambda: "text-embedding-ada-002")
     similarity_threshold: float = Field(default_factory=lambda: 1.0)
     debug: bool = Field(default_factory=lambda: False)
-    citations: bool = Field(default_factory=lambda: True)
+    citations: bool = Field(default_factory=lambda: False)
     alpha: Optional[float] = Field(default=0.5)
     system_context: Optional[str] = None
     search_type: str = Field(default_factory=lambda: "similarity")
@@ -155,7 +161,7 @@ class QuestionToLLM(BaseModel):
     question: str
     llm_key: Union[str, AWSAuthentication]
     llm: str
-    model: str = Field(default="gpt-3.5-turbo")
+    model: Union[str, OllamaModel] = Field(default="gpt-3.5-turbo")
     temperature: float = Field(default=0.0)
     max_tokens: int = Field(default=128)
     debug: bool = Field(default_factory=lambda: False)
@@ -285,6 +291,7 @@ class ScrapeStatusReq(BaseModel):
     id: str
     namespace: str
     namespace_list: Optional[List[str]] | None = None
+    engine: Engine
 
 
 class ScrapeStatusResponse(BaseModel):
diff --git a/tilellm/shared/utility.py b/tilellm/shared/utility.py
index 8aea53c..8613302 100644
--- a/tilellm/shared/utility.py
+++ b/tilellm/shared/utility.py
@@ -1,4 +1,3 @@
-import os
 from functools import wraps
 import logging
 
@@ -7,10 +6,12 @@
 import langchain_aws
 
 from langchain_community.callbacks.openai_info import OpenAICallbackHandler
 from langchain_community.embeddings import CohereEmbeddings #, GooglePalmEmbeddings
+from langchain_experimental.llms.ollama_functions import OllamaFunctions
 from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_ollama import ChatOllama
 from langchain_voyageai import VoyageAIEmbeddings
 from langchain_openai import OpenAIEmbeddings
-
+from openai import base_url
 from tilellm.shared import const
@@ -92,6 +93,14 @@ async def wrapper(self, item, *args, **kwargs):
             )
             dimension = 1024
 
+        elif item.embedding == "ollama":
+            from langchain_ollama.embeddings import OllamaEmbeddings
+            embedding_obj = OllamaEmbeddings(model=item.model.name,
+                                             base_url=item.model.url
+                                             )
+            dimension = item.model.dimension
+            # dimension for llama3.2 is 3072
+
         else:
             embedding_obj = OpenAIEmbeddings(api_key=item.gptkey, model=item.embedding)
             dimension = 1536
@@ -138,6 +147,11 @@ async def wrapper(question, *args, **kwargs):
                                      max_tokens=question.max_tokens,
                                      convert_system_message_to_human=True)
 
+        elif question.llm == "ollama":
+            chat_model = ChatOllama(model=question.model.name,
+                                    temperature=question.temperature,
+                                    num_predict=question.max_tokens,
+                                    base_url=question.model.url)
         elif question.llm == "groq":
             chat_model = ChatGroq(api_key=question.llm_key,
                                   model=question.model,
@@ -263,8 +277,29 @@ async def wrapper(question, *args, **kwargs):
                                  model=question.model,
                                  temperature=question.temperature,
                                  max_tokens=question.max_tokens
+
                                  )
+        elif question.llm == "ollama":
+            callback_handler = TiledeskAICallbackHandler()
+
+            from langchain_ollama.embeddings import OllamaEmbeddings
+            llm_embeddings = OllamaEmbeddings(model=question.model.name,
+                                              base_url=question.model.url
+                                              )
+            dimension = question.model.dimension
+
+            llm = ChatOllama(model=question.model.name,
+                             temperature=question.temperature,
+                             num_predict=question.max_tokens,
+                             base_url=question.model.url,
+                             format="json",
+                             callbacks=[callback_handler]
+                             )
+
+
         elif question.llm == "aws":
             import os
diff --git a/tilellm/store/pinecone/pinecone_repository_base.py b/tilellm/store/pinecone/pinecone_repository_base.py
index 88140ca..f4d4d56 100644
--- a/tilellm/store/pinecone/pinecone_repository_base.py
+++ b/tilellm/store/pinecone/pinecone_repository_base.py
@@ -138,7 +138,7 @@ async def get_pc_ids_namespace(engine: Engine, metadata_id: str, namespace: str)
                         metadata_source=obj.get('metadata').get('source'),
                         metadata_type=obj.get('metadata').get('type'),
                         date=obj.get('metadata').get('date', 'Date not defined'),
-                        text=obj.get('metadata').get(const.PINECONE_TEXT_KEY)
+                        text=obj.get('metadata').get(engine.text_key)  # 'content' on pod indexes, 'text' on serverless
                     )
                 )
@@ -373,7 +373,7 @@ async def get_pc_sources_namespace(engine: Engine, source: str, namespace: str)
                         metadata_source=obj.get('metadata').get('source'),
                         metadata_type=obj.get('metadata').get('type'),
                         date=obj.get('metadata').get('date', 'Date not defined'),
-                        text=obj.get('metadata').get(const.PINECONE_TEXT_KEY)
+                        text=obj.get('metadata').get(engine.text_key)  # 'content' on pod indexes, 'text' on serverless
                     )
                 )
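
As a side note on the Ollama support introduced above: the following is a minimal sketch (not part of the patch) of how the new `OllamaModel` fields (`name`, `url`, `dimension`) map onto the `langchain_ollama` classes wired up in `tilellm/shared/utility.py`. The model name, base URL, and dimension below are placeholder assumptions; use whatever your local Ollama server actually serves.

```python
# Illustrative sketch only (assumed values); mirrors the wiring added in tilellm/shared/utility.py.
from langchain_ollama import ChatOllama
from langchain_ollama.embeddings import OllamaEmbeddings

from tilellm.models.item_model import OllamaModel

# Placeholder model definition: llama3.2 served by a local Ollama instance.
model = OllamaModel(name="llama3.2", url="http://localhost:11434", dimension=3072)

# Embeddings used for indexing and search; `dimension` must match the engine's vector_size.
embeddings = OllamaEmbeddings(model=model.name, base_url=model.url)

# Chat model used for answering; the request's max_tokens is forwarded as num_predict.
chat = ChatOllama(model=model.name,
                  base_url=model.url,
                  temperature=0.0,
                  num_predict=1024)

print(chat.invoke("Say hello").content)
```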