From f0930a6fd1d1bee291a561cd369519915fc7f1e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gwendolyn=20O=2E=20D=C3=B6bel?= <81755070+GwennyGit@users.noreply.github.com> Date: Wed, 8 Nov 2023 10:44:39 +0100 Subject: [PATCH] Removed analysis_db_local_version.py #102 --- refinegems/analysis_db_local_version.py | 302 ------------------------ 1 file changed, 302 deletions(-) delete mode 100644 refinegems/analysis_db_local_version.py diff --git a/refinegems/analysis_db_local_version.py b/refinegems/analysis_db_local_version.py deleted file mode 100644 index 6e9aaffb..00000000 --- a/refinegems/analysis_db_local_version.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python -import re -import requests -import sqlite3 -import pandas as pd -import numpy as np -from refinegems.io import load_a_table_from_database -from refinegems.databases import PATH_TO_DB -from typing import Literal -from tqdm import tqdm -from ratelimit import limits, sleep_and_retry - - -__author__ = "Famke Baeuerle and Gwendolyn O. Döbel" - - -ALL_BIGG_COMPARTMENTS_ONE_LETTER = ('c', 'e', 'p', 'm', 'x', 'r', 'v', 'n', 'g', 'u', 'l', 'h', 'f', 's', 'i', 'w', 'y') -ALL_BIGG_COMPARTMENTS_TWO_LETTER = ('im', 'cx', 'um', 'cm', 'mm') -BIGG_REACTIONS_URL = 'http://bigg.ucsd.edu/api/v2/universal/reactions/' -BIGG_METABOLITES_URL = 'http://bigg.ucsd.edu/api/v2/universal/metabolites/' - -COMPARTMENTS = ('c', 'e', 'p') - - -def get_search_regex(other_db: Literal['KEGG', 'BioCyc', 'SEED'], metabolites: bool) -> str: - """Retrieves the search regex for BioCyc/KEGG/SEED to be used in the BiGG mapping - - Args: - - other_db (Literal): Specifies if the search regex should be for BioCyc/KEGG/SEED - - metabolites (bool): Is required if one wants to search for KEGG/SEED Compound IDs in the bigg_models_metabolites.txt - - Returns: - str: Search regex - """ - if other_db == 'BioCyc': - return 'BioCyc: http://identifiers.org/biocyc/META:(.*?);' - elif other_db == 'KEGG' or other_db == 'SEED': - if metabolites: - return f'{other_db} Compound: http://identifiers.org/{other_db.lower()}.compound/(.*?);' - else: - return f'{other_db} Reaction: http://identifiers.org/{other_db.lower()}.reaction/(.*?);' - - -def compare_ids(id1: str, id2: str) -> bool: - """Compares two strings/IDs & Returns True if one string matches most of the other - - Args: - - id1 (str): ID 1 - - id2 (str): ID 2 - - Returns: - bool: Indicates if most of one string contained in the other - """ - id1_split, id2_split, id1_single_comp, id2_single_comp, id1_comp, id2_comp = None, None, None, None, None, None - - if '_' in id1: id1_split = re.split('_([a-zA-Z]|[0-9])$', id1)[0] - if '_' in id2: id2_split = re.split('_([a-zA-Z]|[0-9])$', id2)[0] - if id1.endswith(ALL_BIGG_COMPARTMENTS_ONE_LETTER): id1_single_comp = id1[:-1] - if id2.endswith(ALL_BIGG_COMPARTMENTS_ONE_LETTER): id2_single_comp = id2[:-1] - if id1.endswith(ALL_BIGG_COMPARTMENTS_TWO_LETTER): id1_comp = id1[:-2] - if id2.endswith(ALL_BIGG_COMPARTMENTS_TWO_LETTER): id2_comp = id2[:-2] - - similar_ids = False - if id1 == id2: similar_ids = True # Both IDs are same - - elif id1_split and id2_split and (id1_split == id2_split): similar_ids = True # Both IDs are same but from different compartments - elif id2_split and (id1 == id2_split): similar_ids = True # - "" - - elif id1_split and (id1_split == id2): similar_ids = True # - "" - - - elif id1_single_comp and id2_single_comp and (id1_single_comp == id2_single_comp): similar_ids = True - elif id2_single_comp and (id1 == id2_single_comp): similar_ids = True - elif id1_single_comp and (id1_single_comp == id2_single_comp): similar_ids = True - - elif id1_comp and id2_comp and (id1_comp == id2_comp): similar_ids = True - elif id2_comp and (id1 == id2_comp): similar_ids = True - elif id1_comp and (id1_comp == id2): similar_ids = True - - elif id1_split and id2_single_comp and (id1_split == id2_single_comp): similar_ids = True - elif id2_split and id1_single_comp and (id1_single_comp == id2_split): similar_ids = True - - elif id1_split and id2_comp and (id1_split == id2_comp): similar_ids = True - elif id2_split and id1_comp and (id1_comp == id2_split): similar_ids = True - - elif id1_comp and id2_single_comp and (id1_comp == id2_single_comp): similar_ids = True - elif id2_comp and id1_single_comp and (id1_single_comp == id2_comp): similar_ids = True - - else: similar_ids = False - - return similar_ids - - -def keep_only_reactions_in_certain_compartments(complete_df: pd.DataFrame, other_db: str) -> pd.DataFrame: - """Extracts all possible BiGG ID variations from database for a BiGG reaction ID, gets the metabolite compartments - & returns table containing only reactions which happen in one of the provided compartments - - Args: - - complete_df (pd.DataFrame): Table containing at least the columns 'bigg_id' & 'KEGG'/'BioCyc'/'SEED' - - other_db (str): String specifying the column name of the identifiers from the not BiGG namespace - - Returns: - pd.DataFrame: Table containing reactions & their compartments - """ - tqdm.pandas() - complete_df = complete_df[['bigg_id', other_db]] # Remove all unnecessary columns - - # (1) Find all occurrencs of a BiGG reaction ID in bigg_reactions table in database - def get_all_similar_bigg_ids(bigg_id_in: str) -> list[str]: - - if '_' in bigg_id_in: bigg_id = re.split('_([a-zA-Z]|[0-9])$', bigg_id_in)[0] - elif bigg_id_in.endswith(ALL_BIGG_COMPARTMENTS_ONE_LETTER): bigg_id = bigg_id_in[:-1] - elif bigg_id_in.endswith(ALL_BIGG_COMPARTMENTS_TWO_LETTER): bigg_id = bigg_id_in[:-2] - else: bigg_id = bigg_id_in - - query = f"SELECT bigg_id, INSTR(bigg_id, '{bigg_id}') bi FROM bigg_reactions WHERE bi > 0" - result = con.execute(query).fetchall() - result = [result_tuple[0] for result_tuple in result] if result else [bigg_id_in] - result = [res for res in result if compare_ids(bigg_id, res)] - return result - - # (2) Use list of all BiGG IDs obtained from database table bigg_reactions to get 'metabolites' - @sleep_and_retry - @limits(calls=10, period=1) - def get_reaction_compartment(bigg_id: str) -> str: - - metabs_from_reac = requests.get(BIGG_REACTIONS_URL + bigg_id, allow_redirects=False).json()['metabolites'] - - comps = [comp_dict.get('compartment_bigg_id') for comp_dict in metabs_from_reac] # Get all compartments for reaction - contained_in_compartments = [(comp in COMPARTMENTS) for comp in comps] # Get True for correct compartment - if not all(contained_in_compartments): # At least one compartment not correct - return np.nan - else: # All compartments correct - if len(set(comps)) == 1: # Set of found compartments of reaction = 1: Reaction happens in one compartment - return comps[0] - else: # Not so important but do not remove reaction as reaction in correct compartments - return 'exchange' # Probably exchange reaction - - # Connect to database & get similar IDs (1) - print('Getting all similar IDs...') - con = sqlite3.connect(PATH_TO_DB) # Open connection to database - complete_df.loc[:,'bigg_id_list'] = complete_df.loc[:, 'bigg_id'].progress_map(get_all_similar_bigg_ids) - #complete_df.progress_apply(get_all_similar_bigg_ids, axis=1) - con.close() # Close connection to database - - # Adjust table to contain one BiGG ID per row from bigg_id_list (1) - complete_df.loc[:, 'id_group'] = complete_df['bigg_id'].ne(complete_df['bigg_id'].shift()).cumsum() # Group similar IDs - complete_df.drop(labels='bigg_id', axis=1, inplace=True) # Drop 'bigg_id' as no longer required - complete_df = complete_df.explode('bigg_id_list', ignore_index=True) # Expand 'bigg_id_list' column - complete_df.rename(columns={'bigg_id_list': 'bigg_id'}, inplace=True) # Rename 'bigg_id_list' to 'bigg_id' - - # (2) Get all compartments for each reaction from BiGG database API - print(f'Getting all IDs with correct compartment {COMPARTMENTS}...') - complete_df.loc[:, 'compartment'] = complete_df.loc[:, 'bigg_id'].progress_map(get_reaction_compartment) - #complete_df.progress_apply(get_reaction_compartment, axis=1) # (2) - - # (3) Remove reactions with compartment = NaN - complete_df.dropna(subset=['compartment'], inplace=True) - - return complete_df - - -# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified -def get_bigg2other_db(other_db: Literal['KEGG', 'BioCyc', 'SEED'], metabolites: bool=False) -> pd.DataFrame: - """Uses list of BiGG reactions/metabolites to get a mapping from BiGG to KEGG/BioCyc Id - - Args: - - other_db (Literal): Set to 'KEGG'/'BioCyc'/'SEED' to map KEGG/BioCyc/SEED IDs to BiGG IDs - - metabolites (bool): Set to True to map other_db IDs to BiGG IDs for metabolites - - Returns: - pd.DataFrame: Table containing BiGG Ids with corresponding KEGG/BioCyc/SEED Ids - """ - - # Get only rows with BioCyc/KEGG entries - db_table_name = 'bigg_metabolites' if metabolites else 'bigg_reactions' - reaction_or_compound = 'Compound' if metabolites else 'Reaction' - other_db_query = other_db if other_db == 'BioCyc' else ' '.join([other_db, reaction_or_compound]) - bigg_db_query = f"SELECT *, INSTR(database_links, '{other_db_query}:') o_db FROM {db_table_name} WHERE o_db > 0" - bigg_db_df = load_a_table_from_database(bigg_db_query) - - db_search_regex = get_search_regex(other_db, metabolites) - - def find_other_db(database_links: str): - m = re.findall( - db_search_regex, - str(database_links)) - if m: - return m - else: - return None - - bigg_db_df[other_db] = bigg_db_df.apply( - lambda row: find_other_db(row['database_links']), axis=1) - bigg_db_df = bigg_db_df.explode(other_db, ignore_index=True) - - if not metabolites: - bigg_db_df = keep_only_reactions_in_certain_compartments(bigg_db_df, other_db) - - bigg_df = bigg_db_df[['bigg_id', other_db]] if metabolites else bigg_db_df[['bigg_id', other_db, 'compartment', 'id_group']] - - return bigg_df - - -# Function originally from refineGEMs.genecomp/refineGEMs.KEGG_analysis --- Modified -def compare_bigg_model(complete_df: pd.DataFrame, model_entities: pd.DataFrame, metabolites: bool=False) -> pd.DataFrame: - """Compares missing entities obtained through genes extracted via KEGG/BioCyc to entities in the model - Needed to back check previous comparisons. - - Args: - - complete_df (pd.DataFrame): Table that contains KEGG/BioCyc Id, BiGG Id & more - - model_entities (pd.DataFrame): BiGG Ids of entities in the model - - metabolites (bool): True if names of metabolites should be added, otherwise false - - Returns: - pd.DataFrame: Table containing entities present in KEGG/BioCyc but not in the model - """ - db = 'KEGG' if 'KEGG' in complete_df.columns else 'BioCyc' # Find out which database was used - - # Get only IDs that are not in model - mapp = complete_df.set_index('bigg_id') - entities = model_entities.set_index('bigg_id') - entities_missing_in_model = mapp[~mapp.index.isin( - entities.index)].reset_index() - - db_ids = entities_missing_in_model.groupby('bigg_id')[db].agg(set) # Get a set of all BioCyc/KEGG IDs belonging to one BiGG ID - - # Add set of BioCyc/KEGG IDs belonging to one BiGG ID to the dataframe - entities_missing_in_model.set_index('bigg_id', inplace=True) - entities_missing_in_model.loc[:, db] = db_ids - entities_missing_in_model.reset_index(inplace=True) - - if 'id_group' in entities_missing_in_model.columns: # Remove reaction ID duplicates but keep all related BiGG & BioCyc/KEGG IDs in a list - aliases = entities_missing_in_model.groupby(['compartment', 'id_group'])['bigg_id'].agg(set) # Get a set of the 'duplicated' BiGG reaction IDs -> aliases - entities_missing_in_model.drop_duplicates(['compartment', 'id_group'], inplace=True, ignore_index=True) # Drop duplicates where compartments & id_group same - - # Add set of BiGG ID aliases to the dataframe - entities_missing_in_model.set_index(['compartment', 'id_group'], inplace=True) - entities_missing_in_model.loc[:, 'bigg_aliases'] = aliases - entities_missing_in_model.reset_index(inplace=True) - - entities_missing_in_model.drop(labels='id_group', axis=1, inplace=True) # id_group is not longer necessary - - entities_missing_in_model.drop_duplicates(subset='bigg_id', inplace=True, ignore_index=True) # Remove BiGG ID duplicates - - # Add name column to dataframe - def get_name_from_bigg(bigg_id: str): - bigg_db = 'bigg_metabolites' if metabolites else 'bigg_reactions' - query = f"SELECT name FROM {bigg_db} WHERE bigg_id=\'{bigg_id}\'" - name_from_bigg = con.execute(query).fetchone()[0] - return name_from_bigg - - con = sqlite3.connect(PATH_TO_DB) # Open connection to database - entities_missing_in_model['name'] = entities_missing_in_model['bigg_id'].map(get_name_from_bigg) - con.close() - - # Add compartment ID to all BiGG metabolites - if metabolites: - def get_compartment_from_id(bigg_id: str): - compartment = bigg_id[-1] - return compartment if compartment in COMPARTMENTS else np.nan # To filter the incorrect compartments out - - entities_missing_in_model['compartment'] = entities_missing_in_model.apply( - lambda row: get_compartment_from_id(row['bigg_id']), axis=1) - entities_missing_in_model.dropna(subset=['compartment'], inplace=True) # Drop all BiGG metabolite IDs which have no valid compartment - - return entities_missing_in_model - - -def add_stoichiometric_values_to_reacs(missing_reacs: pd.DataFrame) -> pd.DataFrame: - """Adds for each reaction a dictionary containing the reactants & products as dictionaries with the BiGG Metabolite - ID as key & the respective absolute stoichiometric value as value - - Args: - - missing_reacs (pd.DataFrame): Table containing missing reactions (Only requires a column containing BiGG IDs) - - Returns: - pd.DataFrame: Table where for each BiGG reaction ID a dictionary containing reactants & products exists - """ - - def get_reactants_and_products_dicts(reaction_id: str) -> list[dict]: - reactants = {} - products = {} - - metabs_from_reac = requests.get(BIGG_REACTIONS_URL + reaction_id).json()['metabolites'] - - for compound_dict in metabs_from_reac: - complete_bigg_id = None - if compound_dict.get('compartment_bigg_id'): - complete_bigg_id = f"{compound_dict.get('bigg_id')}_{compound_dict.get('compartment_bigg_id')}" - else: - complete_bigg_id = compound_dict.get('bigg_id') - if compound_dict.get('stoichiometry') < 0: - reactants[complete_bigg_id] = abs(compound_dict.get('stoichiometry')) - elif compound_dict.get('stoichiometry') > 0: - products[complete_bigg_id] = abs(compound_dict.get('stoichiometry')) - - return str({'reactants': reactants, 'products': products}) - - missing_reacs['bigg_reaction']= missing_reacs.apply( - lambda row: get_reactants_and_products_dicts(str(row['bigg_id'])), axis=1) #, missing_reacs['bigg_products'], result_type='expand' - - return missing_reacs - \ No newline at end of file