
Commit f2b032d
MASSIVE REFACTOR, CLEANED UP EVERYTHING, updated tutorials
SubhadityaMukherjee committed Jul 22, 2024
1 parent f92ca2e commit f2b032d
Showing 4 changed files with 96 additions and 39 deletions.
2 changes: 1 addition & 1 deletion backend/backend.py
@@ -11,7 +11,7 @@

# load the configuration and device
config = load_config_and_device("config.json")
if config["testing_flag"] == True:
if config["testing_flag"]:
config["persist_dir"] = "./data/chroma_db_testing/"
config["test_subset"] = True
config["data_dir"] = "./data/testing_data/"
63 changes: 63 additions & 0 deletions docs/Rag Pipeline/Developer Tutorials/change_model.py
@@ -0,0 +1,63 @@
# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.5'
# jupytext_version: 1.16.3
# kernelspec:
# display_name: openml
# language: python
# name: python3
# ---

# # Tutorial on changing models
# - How would you use a different embedding model and LLM?

from __future__ import annotations
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb

from backend.modules.utils import load_config_and_device
from backend.modules.rag_llm import QASetup

# ## Initial config

config = load_config_and_device("../../../backend/config.json")
config["persist_dir"] = "../../data/doc_examples/chroma_db/"
config["data_dir"] = "../../data/doc_examples/"
config["type_of_data"] = "dataset"
config["training"] = True
config["test_subset"] = True #set this to false while training, this is for demo
# load the persistent database using ChromaDB
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)

# ## Embedding model
# - Pick an embedding model from Hugging Face (HF)

config["embedding_model"] = "BAAI/bge-large-en-v1.5"

# ## LLM model

# - Pick a model from the Ollama library: https://ollama.com/library?sort=popular
# - e.g., mistral
#

config["llm_model"] = "mistral"

# +
qa_dataset_handler = QASetup(
config=config,
data_type=config["type_of_data"],
client=client,
)

qa_dataset, _ = qa_dataset_handler.setup_vector_db_and_qa()
# -
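
# - As a sanity check, a query can be run against qa_dataset the same way
#   evaluation/training_utils.py does it, via QueryProcessor; its import path
#   is not shown in this diff, so the snippet is left commented out:

# result_data_frame, _ = QueryProcessor(
#     query="find me a dataset about flowers",
#     qa=qa_dataset,
#     type_of_query="dataset",
#     config=config,
# ).get_result_from_query()
# print(result_data_frame.head())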

# # IMPORTANT
# - Do NOT forget to also update the model in ollama/get_ollama.sh to the one chosen here
60 changes: 27 additions & 33 deletions evaluation/training_utils.py
@@ -241,41 +241,35 @@ def aggregate_multiple_queries(self, qa_dataset, data_metadata, types_of_llm_app

for query in tqdm(self.queries, total=len(self.queries), leave=True):
for apply_llm_before_rag in types_of_llm_apply:
response_parser = response_parsers[apply_llm_before_rag]

# result_data_frame, _ = get_result_from_query(
# query=query,
# qa=qa_dataset,
# type_of_query="dataset",
# config=self.config,
# )

result_data_frame, _ = QueryProcessor(
query=query,
qa=qa_dataset,
type_of_query="dataset",
config=self.config,
).get_result_from_query()
response_parser.rag_response = {
"initial_response": list(result_data_frame["id"].values)
}

response_parser.fetch_llm_response(query)
result_data_frame = response_parser.parse_and_update_response(
data_metadata
).copy()[["did", "name"]]

result_data_frame["query"] = query
result_data_frame["llm_model"] = self.config["llm_model"]
result_data_frame["embedding_model"] = self.config["embedding_model"]
result_data_frame["llm_before_rag"] = apply_llm_before_rag

# combined_results.append(result_data_frame)
combined_results = pd.concat(
[combined_results, result_data_frame], ignore_index=True
)
combined_results = self.run_query(apply_llm_before_rag, combined_results, data_metadata, qa_dataset,
query, response_parsers)

return combined_results

def run_query(self, apply_llm_before_rag, combined_results, data_metadata, qa_dataset, query, response_parsers):
response_parser = response_parsers[apply_llm_before_rag]
result_data_frame, _ = QueryProcessor(
query=query,
qa=qa_dataset,
type_of_query="dataset",
config=self.config,
).get_result_from_query()
response_parser.rag_response = {
"initial_response": result_data_frame["id"].to_list()
}
response_parser.fetch_llm_response(query)
result_data_frame = response_parser.parse_and_update_response(
data_metadata
).copy()[["did", "name"]]
result_data_frame["query"] = query
result_data_frame["llm_model"] = self.config["llm_model"]
result_data_frame["embedding_model"] = self.config["embedding_model"]
result_data_frame["llm_before_rag"] = apply_llm_before_rag
combined_results = pd.concat(
[combined_results, result_data_frame], ignore_index=True
)
return combined_results
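
A note on the aggregation pattern in both versions: calling pd.concat inside the loop re-copies the accumulated DataFrame on every iteration, which is quadratic in the number of queries. If that ever matters, the usual alternative (a hypothetical refactor, not part of this commit) is to collect per-query frames in a list and concatenate once:

    import pandas as pd

    frames = []
    for query in queries:
        # run_one(query) is a hypothetical stand-in for the per-query work done by run_query above
        frames.append(run_one(query))
    combined_results = pd.concat(frames, ignore_index=True)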
10 changes: 5 additions & 5 deletions start_local.sh
@@ -6,26 +6,26 @@ killall streamlit
PID_FILE="$(pwd)/processes.pid"  # absolute path, so the cd'd sections below all write to the same file

# Start processes and save their PIDs
cd ollama
cd ollama || exit
./get_ollama.sh &
echo $! > $PID_FILE

structured_query=false  # no spaces around = in shell assignments
if [ "$structured_query" == true ]; then
cd ../structured_query
cd ../structured_query || exit
uvicorn llm_service_structured_query:app --host 0.0.0.0 --port 8082 &
echo $! >> $PID_FILE
else
cd ../llm_service
cd ../llm_service || exit
uvicorn llm_service:app --host 0.0.0.0 --port 8081 &
echo $! >> $PID_FILE
fi

cd ../backend
cd ../backend || exit
uvicorn backend:app --host 0.0.0.0 --port 8000 &
echo $! >> $PID_FILE

cd ../frontend
cd ../frontend || exit
streamlit run ui.py &
echo $! >> $PID_FILE
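
To shut everything down later, the PIDs recorded in processes.pid can be terminated in one pass; a minimal companion sketch in Python (a hypothetical helper, not shipped in this commit):

    # stop_local.py: kill every process whose PID was recorded at startup
    import os
    import signal

    with open("processes.pid") as f:
        for line in f:
            try:
                os.kill(int(line.strip()), signal.SIGTERM)
            except (ValueError, ProcessLookupError):
                pass  # skip blank lines and already-exited processes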

