
Commit

modified evaluation pipeline, cleaned up lots of things, fixed bugs with documentation
SubhadityaMukherjee committed Jul 16, 2024
1 parent f048581 commit 8247087
Showing 27 changed files with 4,300 additions and 3,867 deletions.
7 changes: 0 additions & 7 deletions backend/modules/general_utils.py
@@ -10,9 +10,6 @@ def find_device(training: bool = False) -> str:
"""
Description: Find the device to use for the pipeline. If cuda is available, use it. If not, check if MPS is available and use it. If not, use CPU.
Input: training (bool) : Whether the pipeline is being used for training or not.
Returns: device (str) : The device to use for the pipeline.
"""
print("[INFO] Finding device.")
if torch.cuda.is_available():
@@ -27,10 +24,6 @@ def load_config_and_device(config_file: str, training: bool = False) -> dict:
"""
Description: Load the config file and find the device to use for the pipeline.
Input: config_file (str) : The path to the config file.
training (bool) : Whether the pipeline is being used for training or not.
Returns: config (dict) : The config dictionary + device (str) : The device to use for the pipeline.
"""
# Check if the config file exists and load it
if not os.path.exists(config_file):
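For readers skimming the diff, the device-selection order that find_device's docstring describes (CUDA first, then Apple MPS, then CPU) amounts to a few lines. A minimal sketch under that assumption — pick_device is a hypothetical name, not the repository's function:

import torch

def pick_device() -> str:
    # Prefer CUDA, then Apple MPS, then fall back to CPU,
    # mirroring the order described in find_device's docstring.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"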
37 changes: 2 additions & 35 deletions backend/modules/llm.py
@@ -32,9 +32,7 @@ def load_and_process_data(metadata_df: pd.DataFrame, page_content_column: str) -
"""
Description: Load and process the data for the vector store. Split the documents into chunks of 1000 characters.
Input: metadata_df (pd.DataFrame), page_content_column (str)
Returns: chunked documents (list)
"""
# Load data
loader = DataFrameLoader(metadata_df, page_content_column=page_content_column)
@@ -52,9 +50,7 @@ def generate_unique_documents(documents: list, db: Chroma) -> tuple:
Description: Generate unique documents by removing duplicates. This is done by generating unique IDs for the documents and keeping only one of the duplicate IDs.
Source: https://stackoverflow.com/questions/76265631/chromadb-add-single-document-only-if-it-doesnt-exist
Input: documents (list)
Returns: unique_docs (list), unique_ids (list)
"""

# Remove duplicates based on ID (from database)
@@ -95,7 +91,6 @@ def load_document_and_create_vector_store(
chroma_client (chromadb.PersistentClient): The Chroma client.
config (dict): The configuration dictionary.
Returns:
Chroma: The Chroma vector store.
"""
embeddings = load_model(config)
@@ -113,9 +108,7 @@ def load_model(config: dict) -> HuggingFaceEmbeddings | None:
"""
Description: Load the model using HuggingFaceEmbeddings.
Input: config (dict)
Returns: HuggingFaceEmbeddings
"""
print("[INFO] Loading model...")
model_kwargs = {"device": config["device"], "trust_remote_code": True}
@@ -135,9 +128,6 @@ def get_collection_name(config: dict) -> str:
"""
Description: Get the collection name based on the type of data provided in the config.
Input: config (dict)
Returns: str
"""
return {"dataset": "datasets", "flow": "flows"}.get(
config["type_of_data"], "default"
@@ -152,10 +142,6 @@ def load_vector_store(
) -> Chroma:
"""
Description: Load the vector store from the persist directory.
Input: chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)
Returns: Chroma
"""
if not os.path.exists(config["persist_dir"]):
raise Exception(
@@ -174,9 +160,6 @@ def add_documents_to_db(db, unique_docs, unique_ids):
"""
Description: Add documents to the vector store in batches of 512.
Input: db (Chroma), unique_docs (list), unique_ids (list)
Returns: None
"""
bs = 512
if len(unique_docs) < bs:
@@ -187,9 +170,6 @@ def add_documents_to_db(db, unique_docs, unique_ids):
db.add_documents(unique_docs[i : i + bs], ids=unique_ids[i : i + bs])


# def create_vector_store(
# metadata_df, chroma_client, config, embeddings, collection_name
# ):
def create_vector_store(
metadata_df: pd.DataFrame,
chroma_client: ClientAPI,
@@ -200,9 +180,6 @@ def create_vector_store(
"""
Description: Create the vector store using Chroma db. The documents are loaded and processed, unique documents are generated, and the documents are added to the vector store.
Input: metadata_df (pd.DataFrame), chroma_client (chromadb.PersistentClient), config (dict), embeddings (HuggingFaceEmbeddings), collection_name (str)
Returns: db (Chroma)
"""

db = Chroma(
Expand Down Expand Up @@ -244,9 +221,6 @@ def initialize_llm_chain(
"""
Description: Initialize the LLM chain and setup Retrieval QA with the specified configuration.
Input: vectordb (Chroma), config (dict)
Returns: qa (langchain.chains.retrieval_qa.base.RetrievalQA)
"""

return vectordb.as_retriever(
@@ -271,14 +245,10 @@ def setup_vector_db_and_qa(
)
# Create the combined metadata dataframe
metadata_df, all_metadata = create_metadata_dataframe(
handler, openml_data_object, data_id, all_metadata, config=config
handler, openml_data_object, data_id, all_metadata, config=config, subset_ids=subset_ids
)

# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
metadata_df = metadata_df[metadata_df["did"].isin(subset_ids)]


# Create the vector store
vectordb = load_document_and_create_vector_store(
metadata_df, config=config, chroma_client=client
@@ -291,9 +261,6 @@ def get_llm_chain(config: dict, local: bool = False) -> LLMChain | bool:
"""
Description: Get the LLM chain with the specified model and prompt template.
Input: config (dict)
Returns: LLMChain
"""
base_url = "http://127.0.0.1:11434" if local else "http://ollama:11434"
llm = Ollama(model=config["llm_model"], base_url=base_url)
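The batching logic shown in add_documents_to_db is easy to misread in diff form, so here is a minimal, self-contained sketch of the same pattern. It assumes db exposes LangChain Chroma's add_documents(documents, ids=...) method; the function name and the batch-size default are illustrative:

def add_in_batches(db, docs, ids, batch_size=512):
    # Small inputs go to the vector store in a single call.
    if len(docs) < batch_size:
        db.add_documents(docs, ids=ids)
        return
    # Larger inputs are sliced so every insert stays bounded at batch_size.
    for i in range(0, len(docs), batch_size):
        db.add_documents(docs[i : i + batch_size], ids=ids[i : i + batch_size])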
52 changes: 15 additions & 37 deletions backend/modules/metadata_utils.py
@@ -43,39 +43,31 @@ def get_description(self, data_id: int):
"""
Description: Get the description of the OpenML object.
Input: data_id (int) : The data id
Returns: The OpenML object.
"""
raise NotImplementedError

def get_openml_objects(self):
"""
Description: Get the OpenML objects.
Input: None
Returns: The OpenML objects.
"""
raise NotImplementedError

def initialize_cache(self, data_id: Sequence[int]) -> None:
"""
Description: Initialize the cache for the OpenML objects.
Input: data_id (list) : The list of data ids
Returns: None
"""
self.get_description(data_id[0])

def get_metadata(self, data_id: Sequence[int]):
"""
Description: Get metadata from OpenML using parallel processing.
Input: data_id (list) : The list of data ids
Returns: The OpenML objects.
"""
return pqdm(
data_id, self.get_description, n_jobs=self.config["data_download_n_jobs"]
@@ -87,23 +79,20 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
"""
Description: Process the metadata.
Input: openml_data_object (list) : The list of OpenML objects, data_id (list) : The list of data ids, all_dataset_metadata (pd.DataFrame) : The metadata table, file_path (str) : The file path
Returns: The combined metadata dataframe and the updated metadata table.
"""
raise NotImplementedError

def load_metadata(self, file_path: str):
"""
Description: Load metadata from a file.
Input: file_path (str) : The file path
Returns: The metadata dataframe.
"""
try:
return pd.read_csv(file_path)
@@ -135,6 +124,7 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
descriptions = [
extract_attribute(attr, "description") for attr in openml_data_object
@@ -153,6 +143,12 @@ def process_metadata(
all_dataset_metadata, all_data_description_df
)

# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
all_dataset_metadata = all_dataset_metadata[all_dataset_metadata["did"].isin(subset_ids)]


all_dataset_metadata.to_csv(file_path)

return (
@@ -179,6 +175,7 @@ def process_metadata(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
file_path: str,
subset_ids=None
):
descriptions = [
extract_attribute(attr, "description") for attr in openml_data_object
@@ -198,6 +195,10 @@ def process_metadata(
all_data_description_df["Combined_information"] = all_data_description_df.apply(
merge_all_columns_to_string, axis=1
)
# subset the metadata if subset_ids is not None
if subset_ids is not None:
subset_ids = [int(x) for x in subset_ids]
all_dataset_metadata = all_dataset_metadata[all_dataset_metadata["did"].isin(subset_ids)]
all_data_description_df.to_csv(file_path)

return (
@@ -218,9 +219,7 @@ def get_all_metadata_from_openml(
This uses parallel threads (pqdm), so install the package oslo.concurrency to ensure thread safety.
Input: config (dict) : The config dictionary
Returns: all the data descriptions combined with data ids, data ids, and the raw openml objects in a dataframe.
"""

# save_filename = f"./data/all_{config['type_of_data']}_metadata.pkl"
@@ -281,9 +280,7 @@ def extract_attribute(attribute: object, attr_name: str) -> str:
"""
Description: Extract an attribute from the OpenML object.
Input: attribute (object) : The OpenML object
Returns: The attribute value if it exists, else an empty string.
"""
return getattr(attribute, attr_name, "")

@@ -292,10 +289,7 @@ def join_attributes(attribute: object, attr_name: str) -> str:
"""
Description: Join the attributes of the OpenML object.
Input: attribute (object) : The OpenML object
Returns: The joined attributes if they exist, else an empty string.
example: "column - value, column - value, ..."
"""

return (
@@ -315,9 +309,7 @@ def create_combined_information_df(
"""
Description: Create a dataframe with the combined information of the OpenML object.
Input: data_id (int) : The data id, descriptions (list) : The descriptions of the OpenML object, joined_qualities (list) : The joined qualities of the OpenML object, joined_features (list) : The joined features of the OpenML object
Returns: The dataframe with the combined information of the OpenML object.
"""
return pd.DataFrame(
{
@@ -333,9 +325,7 @@ def merge_all_columns_to_string(row: pd.Series) -> str:
"""
Description: Create a single column that has a combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
Input: row (pd.Series) : The row of the dataframe
Returns: The combined string of all the metadata and the description in the form of "column - value, column - value, ... description"
"""

return " ".join([f"{col} - {val}," for col, val in zip(row.index, row.values)])
@@ -348,10 +338,7 @@ def combine_metadata(
"""
Description: Combine the descriptions with the metadata table.
Input: all_dataset_metadata (pd.DataFrame) : The metadata table,
all_data_description_df (pd.DataFrame) : The descriptions
Returns: The combined metadata table.
"""
# Combine the descriptions with the metadata table
all_dataset_metadata = pd.merge(
@@ -375,23 +362,14 @@ def create_metadata_dataframe(
data_id: Sequence[int],
all_dataset_metadata: pd.DataFrame,
config: dict,
subset_ids=None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Description: Creates a dataframe with all the metadata, joined columns with all information
for the type of data specified in the config. If training is set to False,
the dataframes are loaded from the files. If training is set to True, the
dataframes are created and then saved to the files.
Input:
handler (OpenMLObjectHandler): The handler for the OpenML objects.
openml_data_object (list): The list of OpenML objects.
data_id (list): The list of data ids.
all_dataset_metadata (pd.DataFrame): The metadata table.
config (dict): The config dictionary.
Returns:
pd.DataFrame: The combined metadata dataframe.
pd.DataFrame: The updated metadata table.
"""
# use os.path.join to ensure compatibility with different operating systems
file_path = os.path.join(
@@ -402,5 +380,5 @@ def create_metadata_dataframe(
return handler.load_metadata(file_path), all_dataset_metadata

return handler.process_metadata(
openml_data_object, data_id, all_dataset_metadata, file_path
openml_data_object, data_id, all_dataset_metadata, file_path, subset_ids
)
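The main functional change in this file is that the did-based subsetting moved out of setup_vector_db_and_qa and into process_metadata, so the filter is applied before the metadata is written to CSV. A standalone sketch of that filter (the frame and ids below are illustrative, not taken from OpenML):

import pandas as pd

metadata_df = pd.DataFrame({"did": [1, 2, 3], "name": ["a", "b", "c"]})
subset_ids = ["1", "3"]  # ids may arrive as strings, e.g. from an evaluation config

# Same filter as in process_metadata: cast to int, then keep matching dids.
subset_ids = [int(x) for x in subset_ids]
metadata_df = metadata_df[metadata_df["did"].isin(subset_ids)]
# metadata_df now contains only the rows with did 1 and 3.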