
Commit

modified evaluation pipeline, cleaned up lots of things, fixed bugs with documentation
SubhadityaMukherjee committed Jul 16, 2024
1 parent f048581 commit 8247087
Showing 27 changed files with 4,300 additions and 3,867 deletions.
7 changes: 0 additions & 7 deletions backend/modules/general_utils.py
@@ -10,9 +10,6 @@ def find_device(training: bool = False) -> str:
"""
Description: Find the device to use for the pipeline. If cuda is available, use it. If not, check if MPS is available and use it. If not, use CPU.
Input: training (bool) : Whether the pipeline is being used for training or not.
Returns: device (str) : The device to use for the pipeline.
"""
print("[INFO] Finding device.")
if torch.cuda.is_available():
@@ -27,10 +24,6 @@ def load_config_and_device(config_file: str, training: bool = False) -> dict:
"""
Description: Load the config file and find the device to use for the pipeline.
Input: config_file (str) : The path to the config file.
training (bool) : Whether the pipeline is being used for training or not.
Returns: config (dict) : The config dictionary + device (str) : The device to use for the pipeline.
"""
# Check if the config file exists and load it
if not os.path.exists(config_file):
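For readers skimming the diff, the device-selection order that find_device's docstring describes (CUDA first, then Apple MPS, then CPU) amounts to a few lines. A minimal sketch under that assumption — pick_device is a hypothetical name, not the repository's function:

import torch

def pick_device() -> str:
    # Prefer CUDA, then Apple MPS, then fall back to CPU,
    # mirroring the order described in find_device's docstring.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"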
37 changes: 2 additions & 35 deletions backend/modules/llm.py
@@ -32,9 +32,7 @@ def load_and_process_data(metadata_df: pd.DataFrame, page_content_column: str) -
"""
Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.
Input: metadata_df (pd.DataFrame), page_content_column (str)
Returns: chunked documents (list)
"""
# Load data
loader = DataFrameLoader(metadata_df, page_content_column=page_content_column)
@@ -52,9 +50,7 @@ def generate_unique_documents(documents: list, db: Chroma) -> tuple:
Description: Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.
Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
Input: documents (list)
Returns: unique_docs (list), unique_ids (list)
"""

# Remove duplicates based on ID (from database)
@@ -95,7 +91,6 @@ def load_document_and_create_vector_store(
chroma_client (chromadb.PersistentClient): The Chroma client.
config (dict): The configuration dictionary.
Returns:
Chroma: The Chroma vector store.
"""
embeddings = load_model(config)
@@ -113,9 +108,7 @@ def load_model(config: dict) -> HuggingFaceEmbeddings | None:
"""
Description: Load the model using HuggingFaceEmbeddings.
Input: config (dict)
Returns: HuggingFaceEmbeddings
"""
print("[INFO] Loading model...")
model_kwargs = {"device": config["device"], "trust_remote_code": True}
@@ -135,9 +128,6 @@ def get_collection_name(config: dict) -> str:
"""
Description: Get the collection name based on the type of data provided in the config.
Input: config (dict)
Returns: str
"""
return {"dataset": "datasets", "flow": "flows"}.get(
config["type_of_data"], "default"
@@ -152,10 +142,6 @@ def load_vector_store(
) -> Chroma:
"""
Description: Load the vector store from the persist directory.
Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)
Returns: Chroma
"""
if not os.path.exists(config["persist_dir"]):
raise Exception(
@@ -174,9 +160,6 @@ def add_documents_to_db(db, unique_docs, unique_ids):
"""
Description: Add documents to the vector store in batches of 512.
Input: db (Chroma), unique_docs (list), unique_ids (list)
Returns: None
"""
bs = 512
if len(unique_docs) < bs:
@@ -187,9 +170,6 @@ def add_documents_to_db(db, unique_docs, unique_ids):
db.add_documents(unique_docs[i : i + bs], ids=unique_ids[i : i + bs])


# def create_vector_store(
# metadata_df, chroma_client, config, embeddings, collection_name
# ):
def create_vector_store(
metadata_df: pd.DataFrame,
chroma_client: ClientAPI,
@@ -200,9 +180,6 @@ def create_vector_store(
"""
Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.
Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)
Returns: db (Chroma)
"""

db = Chroma(
Expand Down Expand Up @@ -244,9 +221,6 @@ def initialize_llm_chain(
"""
Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.
Input: vectordb (Chroma), config (dict)
Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)
"""

return vectordb.as_retriever(
@@ -271,14 +245,10 @@ def setup_vector_db_and_qa(
)
# Create the combined metadata dataframe
metadata_df, all_metadata = create_metadata_dataframe(
handler, openml_data_object, data_id, all_metadata, config=config
handler, openml_data_object, data_id, all_metadata, config=config, subset_ids=subset_ids
)

# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
metadata_df = metadata_df[metadata_df["did"].isin(subset_ids)]


# Create the vector store
vectordb = load_document_and_create_vector_store(
metadata_df, config=config, chroma_client=client
@@ -291,9 +261,6 @@ def get_llm_chain(config: dict, local: bool = False) -> LLMChain | bool:
"""
Description: Get the LLM chain with the specified model and prompt template.
Input: config (dict)
Returns: LLMChain
"""
base_url = "http://127.0.0.1:11434" if local else "http://ollama:11434"
llm = Ollama(model=config["llm_model"], base_url=base_url)
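The batching logic shown in add_documents_to_db is easy to misread in diff form, so here is a minimal, self-contained sketch of the same pattern. It assumes db exposes LangChain Chroma's add_documents(documents, ids=...) method; the function name and the batch-size default are illustrative:

def add_in_batches(db, docs, ids, batch_size=512):
    # Small inputs go to the vector store in a single call.
    if len(docs) < batch_size:
        db.add_documents(docs, ids=ids)
        return
    # Larger inputs are sliced so every insert stays bounded at batch_size.
    for i in range(0, len(docs), batch_size):
        db.add_documents(docs[i : i + batch_size], ids=ids[i : i + batch_size])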
52 changes: 15 additions & 37 deletions backend/modules/metadata_utils.py
@@ -43,39 +43,31 @@ def get_description(self, data_id: int):
"""
Description: Get the description of the OpenML object.
Input: data_id (int) : The data id
Returns: The OpenML object.
"""
raise NotImplementedError

def get_openml_objects(self):
"""
Description: Get the OpenML objects.
Input: None
Returns: The OpenML objects.
"""
raise NotImplementedError

def initialize_cache(self, data_id: Sequence[int]) -> None:
"""
Description: Initialize the cache for the OpenML objects.
Input: data_id (list) : The list of data ids
Returns: None
"""
self.get_description(data_id[0])

def get_metadata(self, data_id: Sequence[int]):
"""
Description: Get metadata from OpenML using parallel processing.
Input: data_id (list) : The list of data ids
Returns: The OpenML objects.
"""
return pqdm(
data_id, self.get_description, n_jobs=self.config["data_download_n_jobs"]
@@ -87,23 +79,20 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
"""
Description: Process the metadata.
Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path
Returns: The combined metadata dataframe and the updated metadata table.
"""
raise NotImplementedError

def load_metadata(self, file_path: str):
"""
Description: Load metadata from a file.
Input: file_path (str) : The file path
Returns: The metadata dataframe.
"""
try:
return pd.read_csv(file_path)
@@ -135,6 +124,7 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
descriptions = [
extract_attribute(attr, "description") for attr in openml_data_object
@@ -153,6 +143,12 @@ def process_metadata(
all_dataset_metadata, all_data_description_df
)

# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
all_dataset_metadata = all_dataset_metadata[all_dataset_metadata["did"].isin(subset_ids)]


all_dataset_metadata.to_csv(file_path)

return (
@@ -179,6 +175,7 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
descriptions = [
extract_attribute(attr, "description") for attr in openml_data_object
@@ -198,6 +195,10 @@ def process_metadata(
all_data_description_df["Combined_information"] = all_data_description_df.apply(
merge_all_columns_to_string, axis=1
)
# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
all_dataset_metadata = all_dataset_metadata[all_dataset_metadata["did"].isin(subset_ids)]
all_data_description_df.to_csv(file_path)

return (
@@ -218,9 +219,7 @@ def get_all_metadata_from_openml(
This uses parallel threads (pqdm), so install the package oslo.concurrency to ensure thread safety.
Input: config (dict) : The config dictionary
Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.
"""

# save_filename = f"./data/all_{config['type_of_data']}_metadata.pkl"
@@ -281,9 +280,7 @@ def extract_attribute(attribute: object, attr_name: str) -> str:
"""
Description: Extract an attribute from the OpenML object.
Input: attribute (object) : The OpenML object
Returns: The attribute value if it exists, else an empty string.
"""
return getattr(attribute, attr_name, "")

@@ -292,10 +289,7 @@ def join_attributes(attribute: object, attr_name: str) -> str:
"""
Description: Join the attributes of the OpenML object.
Input: attribute (object) : The OpenML object
Returns: The joined attributes if they exist, else an empty string.
example: "column - value, column - value, ..."
"""

return (
@@ -315,9 +309,7 @@ def create_combined_information_df(
"""
Description: Create a dataframe with the combined information of the OpenML object.
Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object
Returns: The dataframe with the combined information of the OpenML object.
"""
return pd.DataFrame(
{
@@ -333,9 +325,7 @@ def merge_all_columns_to_string(row: pd.Series) -> str:
"""
Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
Input: row (pd.Series) : The row of the dataframe
Returns: The combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
"""

return " ".join([f"{col} - {val}," for col, val in zip(row.index, row.values)])
@@ -348,10 +338,7 @@ def combine_metadata(
"""
Description: Combine the descriptions with the metadata table.
Input: all_dataset_metadata (pd.DataFrame) : The metadata table,
all_data_description_df (pd.DataFrame) : The descriptions
Returns: The combined metadata table.
"""
# Combine the descriptions with the metadata table
all_dataset_metadata = pd.merge(
@@ -375,23 +362,14 @@ def create_metadata_dataframe(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
config: dict,
subset_ids=None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Description: Creates a dataframe with all the metadata, joined columns with all information
for the type of data specified in the config. If training is set to False,
the dataframes are loaded from the files. If training is set to True, the
dataframes are created and then saved to the files.
Input:
handler (OpenMLObjectHandler): The handler for the OpenML objects.
openml_data_object (list): The list of OpenML objects.
data_id (list): The list of data ids.
all_dataset_metadata (pd.DataFrame): The metadata table.
config (dict): The config dictionary.
Returns:
pd.DataFrame: The combined metadata dataframe.
pd.DataFrame: The updated metadata table.
"""
# use os.path.join to ensure compatibility with different operating systems
file_path = os.path.join(
@@ -402,5 +380,5 @@ def create_metadata_dataframe(
return handler.load_metadata(file_path), all_dataset_metadata

return handler.process_metadata(
openml_data_object, data_id, all_dataset_metadata, file_path
openml_data_object, data_id, all_dataset_metadata, file_path, subset_ids
)
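The main functional change in this file is that the did-based subsetting moved out of setup_vector_db_and_qa and into process_metadata, so the filter is applied before the metadata is written to CSV. A standalone sketch of that filter (the frame and ids below are illustrative, not taken from OpenML):

import pandas as pd

metadata_df = pd.DataFrame({"did": [1, 2, 3], "name": ["a", "b", "c"]})
subset_ids = ["1", "3"]  # ids may arrive as strings, e.g. from an evaluation config

# Same filter as in process_metadata: cast to int, then keep matching dids.
subset_ids = [int(x) for x in subset_ids]
metadata_df = metadata_df[metadata_df["did"].isin(subset_ids)]
# metadata_df now contains only the rows with did 1 and 3.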