From f2b032d73a6061069a0d24f09baf8c27fa17d5c0 Mon Sep 17 00:00:00 2001
From: SubhadityaMukherjee
Date: Mon, 22 Jul 2024 13:13:25 +0200
Subject: [PATCH] MASSIVE REFACTOR, CLEANED UP EVERYTHING, updated tutorials

---
 backend/backend.py                            |  2 +-
 .../Developer Tutorials/change_model.py       | 60 ++++++++++++++++++
 evaluation/training_utils.py                  | 60 ++++++++----------
 start_local.sh                                | 16 ++++++++--------
 4 files changed, 96 insertions(+), 42 deletions(-)
 create mode 100644 docs/Rag Pipeline/Developer Tutorials/change_model.py

diff --git a/backend/backend.py b/backend/backend.py
index 7fbe713..6f843bf 100644
--- a/backend/backend.py
+++ b/backend/backend.py
@@ -11,7 +11,7 @@
 
 # load the configuration and device
 config = load_config_and_device("config.json")
-if config["testing_flag"] == True:
+if config["testing_flag"]:
     config["persist_dir"] = "./data/chroma_db_testing/"
     config["test_subset"] = True
     config["data_dir"] = "./data/testing_data/"
diff --git a/docs/Rag Pipeline/Developer Tutorials/change_model.py b/docs/Rag Pipeline/Developer Tutorials/change_model.py
new file mode 100644
index 0000000..9856bc0
--- /dev/null
+++ b/docs/Rag Pipeline/Developer Tutorials/change_model.py
@@ -0,0 +1,60 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#     jupytext_version: 1.16.3
+#   kernelspec:
+#     display_name: openml
+#     language: python
+#     name: python3
+# ---
+
+# # Tutorial on changing models
+# - How would you use a different embedding and LLM model?
+
+from __future__ import annotations
+import chromadb
+
+from backend.modules.utils import load_config_and_device
+from backend.modules.rag_llm import QASetup
+
+# ## Initial config
+
+config = load_config_and_device("../../../backend/config.json")
+config["persist_dir"] = "../../data/doc_examples/chroma_db/"
+config["data_dir"] = "../../data/doc_examples/"
+config["type_of_data"] = "dataset"
+config["training"] = True
+config["test_subset"] = True  # set to False while training; True keeps this demo small
+# load the persistent database using ChromaDB
+client = chromadb.PersistentClient(path=config["persist_dir"])
+print(config)
+
+# ## Embedding model
+# - Pick a model from Hugging Face
+
+config["embedding_model"] = "BAAI/bge-large-en-v1.5"
+
+# ## LLM model
+
+# - Pick a model from Ollama - https://ollama.com/library?sort=popular
+# - e.g. mistral
+# +
+
+config["llm_model"] = "mistral"
+
+# +
+qa_dataset_handler = QASetup(
+    config=config,
+    data_type=config["type_of_data"],
+    client=client,
+)
+
+qa_dataset, _ = qa_dataset_handler.setup_vector_db_and_qa()
+# -
+
+# # IMPORTANT
+# - Do NOT forget to also change the model in ollama/get_ollama.sh to match
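Note on the tutorial above: it assumes the model named in config["llm_model"]
has already been pulled into the local Ollama server. A minimal pre-flight
check, sketched under the assumption that Ollama serves its REST API on the
default port 11434 (the helper name is hypothetical and not part of this
patch):

    # check that the configured LLM is available locally before running the tutorial
    import requests

    def ollama_model_available(model: str, host: str = "http://localhost:11434") -> bool:
        """Return True if `model` (e.g. 'mistral') is pulled into the local Ollama server."""
        tags = requests.get(f"{host}/api/tags", timeout=5).json()
        # Ollama reports tags such as 'mistral:latest', so also match on the base name
        return any(
            entry["name"] == model or entry["name"].split(":")[0] == model
            for entry in tags.get("models", [])
        )

    if not ollama_model_available("mistral"):
        raise SystemExit("Model missing: run `ollama pull mistral` first.")

The embedding model needs no such check: the Hugging Face weights are
typically downloaded on first use when the vector store is built.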
"initial_response": list(result_data_frame["id"].values) - } - - response_parser.fetch_llm_response(query) - result_data_frame = response_parser.parse_and_update_response( - data_metadata - ).copy()[["did", "name"]] - - result_data_frame["query"] = query - result_data_frame["llm_model"] = self.config["llm_model"] - result_data_frame["embedding_model"] = self.config["embedding_model"] - result_data_frame["llm_before_rag"] = apply_llm_before_rag - - # combined_results.append(result_data_frame) - combined_results = pd.concat( - [combined_results, result_data_frame], ignore_index=True - ) + combined_results = self.run_query(apply_llm_before_rag, combined_results, data_metadata, qa_dataset, + query, response_parsers) # Concatenate all collected DataFrames at once # combined_df = pd.concat(combined_results, ignore_index=True) return combined_results + + def run_query(self, apply_llm_before_rag, combined_results, data_metadata, qa_dataset, query, response_parsers): + response_parser = response_parsers[apply_llm_before_rag] + result_data_frame, _ = QueryProcessor( + query=query, + qa=qa_dataset, + type_of_query="dataset", + config=self.config, + ).get_result_from_query() + response_parser.rag_response = { + "initial_response": result_data_frame["id"].to_list() + } + response_parser.fetch_llm_response(query) + result_data_frame = response_parser.parse_and_update_response( + data_metadata + ).copy()[["did", "name"]] + result_data_frame["query"] = query + result_data_frame["llm_model"] = self.config["llm_model"] + result_data_frame["embedding_model"] = self.config["embedding_model"] + result_data_frame["llm_before_rag"] = apply_llm_before_rag + # combined_results.append(result_data_frame) + combined_results = pd.concat( + [combined_results, result_data_frame], ignore_index=True + ) + return combined_results diff --git a/start_local.sh b/start_local.sh index 74f8b8f..6926758 100755 --- a/start_local.sh +++ b/start_local.sh @@ -6,26 +6,26 @@ killall streamlit PID_FILE="processes.pid" # Start processes and save their PIDs -cd ollama +cd ollama || exit ./get_ollama.sh & echo $! > $PID_FILE structured_query = false if [ "$structured_query" == true ]; then - cd ../structured_query + cd ../structured_query || exit uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 & echo $! > $PID_FILE else - cd ../llm_service + cd ../llm_service || exit uvicorn llm_service:app --host 0.0.0.0 --port 8081 & echo $! > $PID_FILE fi -cd ../backend +cd ../backend || exit uvicorn backend:app --host 0.0.0.0 --port 8000 & echo $! >> $PID_FILE -cd ../frontend +cd ../frontend || exit streamlit run ui.py & echo $! >> $PID_FILE