Skip to content

Commit

Permalink
fixed generation time bug
Browse files Browse the repository at this point in the history
  • Loading branch information
SubhadityaMukherjee committed Aug 27, 2024
1 parent 04e9828 commit ce926a3
Show file tree
Hide file tree
Showing 19 changed files with 1,886 additions and 1,552 deletions.
Binary file not shown.
Binary file modified data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/header.bin
Binary file not shown.
Binary file not shown.
Binary file modified data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/length.bin
Binary file not shown.
Binary file modified data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/link_lists.bin
Binary file not shown.
3 changes: 3 additions & 0 deletions docs/Documentation Bot/api_reference.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
## Documentation Bot

::: documentation_query_utils
8 changes: 8 additions & 0 deletions docs/Documentation Bot/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Documentation Bot

- This bot crawls the OpenML documentation, embeds it into a vector store, and uses an LLM to answer questions about the project.

## How to run

- First, run the crawler to fetch the documentation from OpenML: `python run_crawler.py`. This creates a `data` folder containing the crawled documentation.
- For inference, run ```uvicorn documentation_query:app --host 0.0.0.0 --port 8083 &```
8 changes: 8 additions & 0 deletions documentation_bot/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Documentation Bot

- This bot crawls the OpenML documentation, embeds it into a vector store, and uses an LLM to answer questions about the project.

## How to run

- First, run the crawler to fetch the documentation from OpenML: `python run_crawler.py`. This creates a `data` folder containing the crawled documentation.
- For inference, run ```uvicorn documentation_query:app --host 0.0.0.0 --port 8083 &```
Empty file added documentation_bot/__init__.py
Empty file.
4 changes: 4 additions & 0 deletions documentation_bot/base_urls.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
https://openml.github.io/openml-python/main/
https://docs.openml.org/
https://openml.org/apis/
https://github.com/openml/openml-python/tree/develop/openml
27 changes: 11 additions & 16 deletions documentation_bot/documentation_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,22 @@
from fastapi.responses import JSONResponse, StreamingResponse
from httpx import ConnectTimeout
from tenacity import retry, retry_if_exception_type, stop_after_attempt
from utils import ChromaStore, Crawler
from documentation_query_utils import ChromaStore, Crawler, stream_response
from langchain_ollama import ChatOllama

# TODO : make this into a separate thing using config
recrawl_websites = False

crawled_files_data_path = "../data/crawler/crawled_data.csv"
chroma_path = "../data/crawler/"
model_name = "BAAI/bge-small-en"
rag_model_name = "BAAI/bge-small-en"
generation_model_name = "llama3" # ollama

generation_llm = ChatOllama(
model=generation_model_name, temperature=0.0
)
# Send test message to the generation model
generation_llm.invoke("test generation")

# Crawl the websites and save the data
num_of_websites_to_crawl = None # none for all

Expand All @@ -31,28 +37,17 @@

# Initialize the ChromaStore and embed the data
chroma_store = ChromaStore(
model_name=model_name,
rag_model_name=rag_model_name,
crawled_files_data_path=crawled_files_data_path,
chroma_file_path=chroma_path,
generation_model_name=generation_model_name,
generation_llm=generation_llm,
)
if recrawl_websites == True:
chroma_store.read_data_and_embed()

app = FastAPI()
session_id = str(uuid.uuid4())


def stream_response(response):
    """Yield the ``"answer"`` part of each streamed chunk as a string.

    Args:
        response: Iterable of chunks; each chunk is indexed with the key
            ``"answer"``.

    Yields:
        ``str(chunk["answer"])`` for every chunk that has an answer, and an
        empty string for chunks that cannot be indexed that way, so the
        consumer still receives one item per chunk.
    """
    for line in response:
        try:
            text = str(line["answer"])
        except (LookupError, TypeError):
            # Chunk has no "answer" entry or is not subscriptable: emit an
            # empty piece instead of aborting the whole stream.
            text = ""
        # The yield sits outside the try block so that closing the generator
        # (GeneratorExit) or an interrupt ends the stream instead of being
        # swallowed by the handler.
        yield text


@app.get("/documentationquery/{query}", response_class=JSONResponse)
@retry(stop=stop_after_attempt(3), retry=retry_if_exception_type(ConnectTimeout))
async def get_documentation_query(query: str):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,21 @@
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_ollama import ChatOllama

from tqdm.auto import tqdm

def stream_response(response):
    """
    Description: Stream the model's response by yielding the "answer" part of
    each chunk as a string.

    Input: response - iterable of chunks, each indexed with the key "answer".

    Returns: a generator of strings; a chunk that has no "answer" entry (or
    that cannot be indexed at all) produces an empty string so the consumer
    still receives one item per chunk.
    """
    for line in response:
        try:
            text = str(line["answer"])
        except (LookupError, TypeError):
            # Chunk has no "answer" entry or is not subscriptable: emit an
            # empty piece instead of aborting the whole stream.
            text = ""
        # The yield sits outside the try block so that closing the generator
        # (GeneratorExit) or an interrupt ends the stream instead of being
        # swallowed by the handler.
        yield text

def find_device() -> str:
"""
Expand All @@ -36,6 +48,10 @@ def find_device() -> str:


def get_session_history(session_id: str) -> BaseChatMessageHistory:
"""
Description: This function is used to get the chat history of a session.
"""
# print("this is the session id", session_id)
if session_id not in store:
store[session_id] = ChatMessageHistory()
Expand All @@ -54,10 +70,8 @@ def __init__(
recrawl_websites=False,
num_of_websites_to_crawl=None,
):
self.base_urls = [
"https://openml.github.io/openml-python/main/",
"https://docs.openml.org/",
]
with open("./base_urls.txt", "r") as f:
self.base_urls = f.read().splitlines()
self.crawled_files_data_path = crawled_files_data_path
self.recrawl_websites = recrawl_websites
self.num_of_websites_to_crawl = num_of_websites_to_crawl
Expand Down Expand Up @@ -195,15 +209,15 @@ def do_crawl(self):
class ChromaStore:
def __init__(
self,
model_name,
rag_model_name,
crawled_files_data_path,
chroma_file_path,
generation_model_name,
generation_llm,
) -> None:
self.model_name = model_name
self.rag_model_name = rag_model_name
self.device = find_device()
self.hf_embedding_function = HuggingFaceBgeEmbeddings(
model_name=self.model_name,
model_name=self.rag_model_name,
model_kwargs={"device": self.device},
encode_kwargs={"normalize_embeddings": True},
)
Expand All @@ -225,19 +239,22 @@ def __init__(
"also reformulate the question. Do NOT answer the question, "
"just reformulate it if needed and otherwise return it as is."
)
self.generation_model_name = generation_model_name
self.generation_llm = ChatOllama(
model=self.generation_model_name, temperature=0.0
)
self.generation_llm = generation_llm


def read_data_and_embed(self): # inference
"""
Description: This function is used to read the crawled data and embed it using the Hugging Face model.
"""
if not os.path.exists(self.crawled_files_data_path):
print("Crawled data does not exist. Please run the crawler first.")
return

df = pd.read_csv(self.crawled_files_data_path)
df["joined"] = df.apply(self._join_columns, axis=1)
docs = DataFrameLoader(df, page_content_column="joined").load()


# Splitting the document texts into smaller chunks
docs_texts = self._split_documents(docs)
Expand Down Expand Up @@ -281,11 +298,20 @@ def _split_documents(self, docs):
return splitter.split_documents(docs)

def setup_inference(self, session_id: str) -> None:
"""
Description: This function is used to setup the inference for the bot.
"""
self.store = {}
self.session_id = session_id

def openml_page_search(self, input: str):

"""
Description: Use the Chroma vector store to search for the most relevant page to the input question , contextualize the question and answer it.
"""

vectorstore = Chroma(
persist_directory=self.chroma_file_path,
embedding_function=self.hf_embedding_function,
Expand Down Expand Up @@ -331,12 +357,7 @@ def openml_page_search(self, input: str):
output_messages_key="answer",
)

# answer = conversational_rag_chain.invoke(
# {"input": f"{input}"},
# config={
# "configurable": {"session_id": self.session_id}
# }, # constructs a key "abc123" in `store`.
# )["answer"]

answer = conversational_rag_chain.stream(
{"input": f"{input}"},
config={
Expand Down
12 changes: 12 additions & 0 deletions documentation_bot/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
beautifulsoup4==4.12.3
fastapi==0.112.2
httpx==0.27.0
langchain==0.2.14
langchain_community==0.2.12
langchain_core==0.2.35
langchain_ollama==0.1.1
pandas==2.2.2
Requests==2.32.3
tenacity==8.3.0
torch==2.3.0
tqdm==4.66.4
34 changes: 34 additions & 0 deletions documentation_bot/run_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Crawl the OpenML documentation sites and embed the crawled pages.

Run with ``python run_crawler.py``. The crawled pages are written to
``crawled_files_data_path`` and the embeddings are stored under ``chroma_path``.
"""
import os

from documentation_query_utils import ChromaStore, Crawler
from langchain_ollama import ChatOllama

recrawl_websites = True

crawled_files_data_path = "../data/crawler/crawled_data.csv"
chroma_path = "../data/crawler/"
rag_model_name = "BAAI/bge-small-en"
generation_model_name = "llama3"  # ollama

# None means no limit: crawl all websites.
num_of_websites_to_crawl = None

# exist_ok makes a separate existence check unnecessary.
os.makedirs(chroma_path, exist_ok=True)

# Crawl the websites and save the data
crawler = Crawler(
    crawled_files_data_path=crawled_files_data_path,
    recrawl_websites=recrawl_websites,
    num_of_websites_to_crawl=num_of_websites_to_crawl,
)
crawler.do_crawl()

# ChromaStore takes an already-constructed chat model (generation_llm) and the
# embedding model name as rag_model_name, so build the model here the same way
# documentation_query.py does.
generation_llm = ChatOllama(model=generation_model_name, temperature=0.0)

# Initialize the ChromaStore and embed the data
chroma_store = ChromaStore(
    rag_model_name=rag_model_name,
    crawled_files_data_path=crawled_files_data_path,
    chroma_file_path=chroma_path,
    generation_llm=generation_llm,
)
if recrawl_websites:
    chroma_store.read_data_and_embed()
2 changes: 1 addition & 1 deletion frontend/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@
with st.spinner("Loading Required Data"):
config_path = Path("../backend/config.json")
ui_loader = UILoader(config_path)
ui_loader.generate_complete_ui()
ui_loader.generate_complete_ui()
9 changes: 4 additions & 5 deletions frontend/ui_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ def __init__(self, config_path):
st.session_state.messages = []

# container for company description and logo
def generate_logo_header(
def _generate_logo_header(
self,
):

Expand All @@ -367,8 +367,9 @@ def generate_logo_header(

def generate_complete_ui(self):

self.generate_logo_header()
self._generate_logo_header()
chat_container = st.container()
# self.disclaimer_dialog()
with chat_container:
with st.form(key="chat_form"):
user_input = st.text_input(
Expand Down Expand Up @@ -404,7 +405,7 @@ def create_chat_interface(self, user_input, query_type=None, ai_filter=False):
with st.chat_message(name="ai"):
st.write("OpenML Agent: ", "Hello! How can I help you today?")
st.write(
"Note that results are powered by local LLM models and may not be accurate. Please refer to the official OpenML website for accurate information."
":warning: Note that results are powered by local LLM models and may not be accurate. Please refer to the official OpenML website for accurate information."
)

# Handle user input
Expand Down Expand Up @@ -477,8 +478,6 @@ def display_results(self, initial_response, role):
"""
Description: Display the results in a DataFrame
"""
# st.write("OpenML Agent: ")

try:
st.dataframe(initial_response)
except:
Expand Down
2 changes: 1 addition & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ plugins:
default_handler: python
handlers:
python:
paths: [backend/modules, tests/, frontend, llm_service, ollama, tools, evaluation/]
paths: [backend/modules, tests/, frontend, llm_service, ollama, tools, evaluation/, documentation_bot/]
load_external_modules: true
show_source: true
options:
Expand Down
Loading

0 comments on commit ce926a3

Please sign in to comment.