fixed generation time bug

openml-labs · Aug 27, 2024 · ce926a3 · ce926a3
1 parent 04e9828
commit ce926a3
Show file tree

Hide file tree

Showing 19 changed files with 1,886 additions and 1,552 deletions.
diff --git a/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/data_level0.bin b/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/data_level0.bin
diff --git a/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/header.bin b/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/header.bin
diff --git a/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/index_metadata.pickle b/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/index_metadata.pickle
diff --git a/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/length.bin b/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/length.bin
diff --git a/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/link_lists.bin b/data/crawler/5d1d4bae-0137-4cfb-9783-64f67098e434/link_lists.bin
diff --git a/docs/Documentation Bot/api_reference.md b/docs/Documentation Bot/api_reference.md
@@ -0,0 +1,3 @@
+## Documentation Bot
+
+::: documentation_query_utils
diff --git a/docs/Documentation Bot/index.md b/docs/Documentation Bot/index.md
@@ -0,0 +1,8 @@
+# Documentation Bot
+
+- This bot crawls the OpenML documentation and uses an LLM, together with retrieval over the crawled pages, to answer questions about the project.
+
+## How to run
+
+- First run the crawler to get the documentation from OpenML: `python run_crawler.py`. This will create a `data` folder with the documentation in it.
+- For inference, run ```uvicorn documentation_query:app --host 0.0.0.0 --port 8083 &```
diff --git a/documentation_bot/README.md b/documentation_bot/README.md
@@ -0,0 +1,8 @@
+# Documentation Bot
+
+- This bot crawls the OpenML documentation and uses an LLM, together with retrieval over the crawled pages, to answer questions about the project.
+
+## How to run
+
+- First run the crawler to get the documentation from OpenML: `python run_crawler.py`. This will create a `data` folder with the documentation in it.
+- For inference, run ```uvicorn documentation_query:app --host 0.0.0.0 --port 8083 &```
diff --git a/documentation_bot/__init__.py b/documentation_bot/__init__.py
diff --git a/documentation_bot/base_urls.txt b/documentation_bot/base_urls.txt
@@ -0,0 +1,4 @@
+https://openml.github.io/openml-python/main/
+https://docs.openml.org/
+https://openml.org/apis/
+https://github.com/openml/openml-python/tree/develop/openml
diff --git a/documentation_bot/documentation_query.py b/documentation_bot/documentation_query.py
@@ -5,16 +5,22 @@
 from fastapi.responses import JSONResponse, StreamingResponse
 from httpx import ConnectTimeout
 from tenacity import retry, retry_if_exception_type, stop_after_attempt
-from utils import ChromaStore, Crawler
+from documentation_query_utils import ChromaStore, Crawler, stream_response
+from langchain_ollama import ChatOllama
 
-# TODO : make this into a separate thing using config
 recrawl_websites = False
 
 crawled_files_data_path = "../data/crawler/crawled_data.csv"
 chroma_path = "../data/crawler/"
-model_name = "BAAI/bge-small-en"
+rag_model_name = "BAAI/bge-small-en"
 generation_model_name = "llama3"  # ollama
 
+generation_llm = ChatOllama(
+    model=generation_model_name, temperature=0.0
+)
+# Send test message to the generation model
+generation_llm.invoke("test generation")
+
 # Crawl the websites and save the data
 num_of_websites_to_crawl = None  # none for all
 
@@ -31,28 +37,17 @@
 
 # Initialize the ChromaStore and embed the data
 chroma_store = ChromaStore(
-    model_name=model_name,
+    rag_model_name=rag_model_name,
     crawled_files_data_path=crawled_files_data_path,
     chroma_file_path=chroma_path,
-    generation_model_name=generation_model_name,
+    generation_llm=generation_llm,
 )
 if recrawl_websites == True:
     chroma_store.read_data_and_embed()
 
 app = FastAPI()
 session_id = str(uuid.uuid4())
 
-
-def stream_response(response):
-    for line in response:
-        try:
-            yield str(line["answer"])
-        except GeneratorExit:
-            break
-        except:
-            yield ""
-
-
 @app.get("/documentationquery/{query}", response_class=JSONResponse)
 @retry(stop=stop_after_attempt(3), retry=retry_if_exception_type(ConnectTimeout))
 async def get_documentation_query(query: str):

diff --git a/documentation_bot/utils.py → ...entation_bot/documentation_query_utils.py b/documentation_bot/utils.py → ...entation_bot/documentation_query_utils.py
@@ -17,9 +17,21 @@
 from langchain_core.chat_history import BaseChatMessageHistory
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.runnables.history import RunnableWithMessageHistory
-from langchain_ollama import ChatOllama
+
 from tqdm.auto import tqdm
 
+def stream_response(response):
+    """
+    Description: Yield the "answer" field of each streamed chunk as a string.
+    A chunk whose "answer" cannot be read yields "" so the stream is not cut short.
+    """
+    for line in response:
+        try:
+            yield str(line["answer"])
+        except GeneratorExit:
+            break
+        except Exception:  # not a bare except: let KeyboardInterrupt/SystemExit propagate
+            yield ""
 
 def find_device() -> str:
     """
@@ -36,6 +48,10 @@ def find_device() -> str:
 
 
 def get_session_history(session_id: str) -> BaseChatMessageHistory:
+    """
+    Description: This function is used to get the chat history of a session.
+    
+    """
     # print("this is the session id", session_id)
     if session_id not in store:
         store[session_id] = ChatMessageHistory()
@@ -54,10 +70,8 @@ def __init__(
         recrawl_websites=False,
         num_of_websites_to_crawl=None,
     ):
-        self.base_urls = [
-            "https://openml.github.io/openml-python/main/",
-            "https://docs.openml.org/",
-        ]
+        with open("./base_urls.txt", "r") as f:
+            self.base_urls = f.read().splitlines()
         self.crawled_files_data_path = crawled_files_data_path
         self.recrawl_websites = recrawl_websites
         self.num_of_websites_to_crawl = num_of_websites_to_crawl
@@ -195,15 +209,15 @@ def do_crawl(self):
 class ChromaStore:
     def __init__(
         self,
-        model_name,
+        rag_model_name,
         crawled_files_data_path,
         chroma_file_path,
-        generation_model_name,
+        generation_llm,
     ) -> None:
-        self.model_name = model_name
+        self.rag_model_name = rag_model_name
         self.device = find_device()
         self.hf_embedding_function = HuggingFaceBgeEmbeddings(
-            model_name=self.model_name,
+            model_name=self.rag_model_name,
             model_kwargs={"device": self.device},
             encode_kwargs={"normalize_embeddings": True},
         )
@@ -225,19 +239,22 @@ def __init__(
             "also reformulate the question. Do NOT answer the question, "
             "just reformulate it if needed and otherwise return it as is."
         )
-        self.generation_model_name = generation_model_name
-        self.generation_llm = ChatOllama(
-            model=self.generation_model_name, temperature=0.0
-        )
+        self.generation_llm = generation_llm
+
 
     def read_data_and_embed(self):  # inference
+        """
+        Description: This function is used to read the crawled data and embed it using the Hugging Face model.
+        
+        """
         if not os.path.exists(self.crawled_files_data_path):
             print("Crawled data does not exist. Please run the crawler first.")
             return
 
         df = pd.read_csv(self.crawled_files_data_path)
         df["joined"] = df.apply(self._join_columns, axis=1)
         docs = DataFrameLoader(df, page_content_column="joined").load()
+
 
         # Splitting the document texts into smaller chunks
         docs_texts = self._split_documents(docs)
@@ -281,11 +298,20 @@ def _split_documents(self, docs):
         return splitter.split_documents(docs)
 
     def setup_inference(self, session_id: str) -> None:
+        """
+        Description: This function is used to setup the inference for the bot.
+        
+        """
         self.store = {}
         self.session_id = session_id
 
     def openml_page_search(self, input: str):
 
+        """
+        Description: Use the Chroma vector store to search for the most relevant page to the input question , contextualize the question and answer it.
+        
+        """
+
         vectorstore = Chroma(
             persist_directory=self.chroma_file_path,
             embedding_function=self.hf_embedding_function,
@@ -331,12 +357,7 @@ def openml_page_search(self, input: str):
             output_messages_key="answer",
         )
 
-        # answer = conversational_rag_chain.invoke(
-        #     {"input": f"{input}"},
-        #     config={
-        #         "configurable": {"session_id": self.session_id}
-        #     },  # constructs a key "abc123" in `store`.
-        # )["answer"]
+
         answer = conversational_rag_chain.stream(
             {"input": f"{input}"},
             config={

diff --git a/documentation_bot/requirements.txt b/documentation_bot/requirements.txt
@@ -0,0 +1,12 @@
+beautifulsoup4==4.12.3
+fastapi==0.112.2
+httpx==0.27.0
+langchain==0.2.14
+langchain_community==0.2.12
+langchain_core==0.2.35
+langchain_ollama==0.1.1
+pandas==2.2.2
+Requests==2.32.3
+tenacity==8.3.0
+torch==2.3.0
+tqdm==4.66.4
diff --git a/documentation_bot/run_crawler.py b/documentation_bot/run_crawler.py
@@ -0,0 +1,34 @@
+import os
+
+from documentation_query_utils import ChromaStore, Crawler
+from langchain_ollama import ChatOllama
+
+recrawl_websites = True
+
+crawled_files_data_path = "../data/crawler/crawled_data.csv"
+chroma_path = "../data/crawler/"
+rag_model_name = "BAAI/bge-small-en"
+generation_model_name = "llama3"  # ollama
+# ChromaStore takes a ready-made LLM (generation_llm), not a model name
+generation_llm = ChatOllama(model=generation_model_name, temperature=0.0)
+num_of_websites_to_crawl = None  # none for all
+
+os.makedirs(chroma_path, exist_ok=True)  # no-op if it already exists
+
+# Crawl the websites and save the data
+crawler = Crawler(
+    crawled_files_data_path=crawled_files_data_path,
+    recrawl_websites=recrawl_websites,
+    num_of_websites_to_crawl=num_of_websites_to_crawl,
+)
+crawler.do_crawl()
+
+# Initialize the ChromaStore and embed the data
+chroma_store = ChromaStore(
+    rag_model_name=rag_model_name,
+    crawled_files_data_path=crawled_files_data_path,
+    chroma_file_path=chroma_path,
+    generation_llm=generation_llm,
+)
+if recrawl_websites:
+    chroma_store.read_data_and_embed()
diff --git a/frontend/ui.py b/frontend/ui.py
@@ -15,4 +15,4 @@
 with st.spinner("Loading Required Data"):
     config_path = Path("../backend/config.json")
     ui_loader = UILoader(config_path)
-    ui_loader.generate_complete_ui()
+ui_loader.generate_complete_ui()
diff --git a/frontend/ui_utils.py b/frontend/ui_utils.py
@@ -352,7 +352,7 @@ def __init__(self, config_path):
             st.session_state.messages = []
 
     # container for company description and logo
-    def generate_logo_header(
+    def _generate_logo_header(
         self,
     ):
 
@@ -367,8 +367,9 @@ def generate_logo_header(
 
     def generate_complete_ui(self):
 
-        self.generate_logo_header()
+        self._generate_logo_header()
         chat_container = st.container()
+        # self.disclaimer_dialog()
         with chat_container:
             with st.form(key="chat_form"):
                 user_input = st.text_input(
@@ -404,7 +405,7 @@ def create_chat_interface(self, user_input, query_type=None, ai_filter=False):
             with st.chat_message(name="ai"):
                 st.write("OpenML Agent: ", "Hello! How can I help you today?")
                 st.write(
-                    "Note that results are powered by local LLM models and may not be accurate. Please refer to the official OpenML website for accurate information."
+                    ":warning: Note that results are powered by local LLM models and may not be accurate. Please refer to the official OpenML website for accurate information."
                 )
 
         # Handle user input
@@ -477,8 +478,6 @@ def display_results(self, initial_response, role):
         """
         Description: Display the results in a DataFrame
         """
-        # st.write("OpenML Agent: ")
-
         try:
             st.dataframe(initial_response)
         except:

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -41,7 +41,7 @@ plugins:
       default_handler: python
       handlers:
         python:
-          paths: [backend/modules, tests/, frontend, llm_service, ollama, tools, evaluation/]
+          paths: [backend/modules, tests/, frontend, llm_service, ollama, tools, evaluation/, documentation_bot/]
           load_external_modules: true
           show_source: true
           options: