Skip to content

Commit

Permalink
Started finalising BioCycGapFiller #52 #126
Browse files Browse the repository at this point in the history
  • Loading branch information
GwennyGit committed Aug 16, 2024
1 parent 9d7c017 commit 1c1420c
Showing 1 changed file with 61 additions and 52 deletions.
113 changes: 61 additions & 52 deletions src/refinegems/classes/gapfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,8 @@ class GapFiller(ABC):
"""Abstract base class for the gap filling.
Already includes functions for the "filling" part of the gap-filling approach
and some helper functions. Each subclass needs an implementation of `get_missing_genes`
and `get_missing_reactions` to determine the entities, that are missing in the model.
and some helper functions. Each subclass needs an implementation of `find_missing_genes`
and `find_missing_reactions` to determine the entities, that are missing in the model.
Attributes:
- full_gene_list (list):
Expand Down Expand Up @@ -252,12 +252,12 @@ def __init__(self) -> None:

@abstractmethod
# TODO write output format and what this functions needs to do
def get_missing_genes(self, model):
def find_missing_genes(self, model):
pass

@abstractmethod
# TODO write output format and what this functions needs to do
def get_missing_reacs(self,model):
def find_missing_reacs(self,model):
pass

# finding the gaps
Expand Down Expand Up @@ -412,7 +412,7 @@ def add_gene_reac_associations_from_table(self,model:libModel,
warnings.warn(mes,UserWarning)


# @TODO BioCyc not implemeneted
# @TEST BioCyc
# @TODO logging, save stuff for manual curation etc. -> started a bit
# @TEST - somewhat seems to work - for now
def add_reactions_from_table(self, model:cobra.Model,
Expand All @@ -423,7 +423,7 @@ def add_reactions_from_table(self, model:cobra.Model,
idprefix:str='refineGEMs',
namespace:Literal['BiGG']='BiGG') -> pd.DataFrame:
"""Helper function to add reactions to a model from the missing_reactions table
(output of the chosen implementation of :py:meth:`~refinegems.classes.gapfill.GapFiller.get_missing_reacs`)
(output of the chosen implementation of :py:meth:`~refinegems.classes.gapfill.GapFiller.find_missing_reacs`)
Args:
- model (cobra.Model):
Expand Down Expand Up @@ -486,9 +486,16 @@ def add_reactions_from_table(self, model:cobra.Model,
references={'ec-code':[row['ec-code']]},
idprefix=idprefix,
namespace=namespace)
# BioCyc @TODO
# BioCyc
# @TEST
case 'BioCyc':
reac = build_reaction_biocyc()
refs = row['references']
refs['ec-code'] = row['ec-code']
reac = build_reaction_kegg(model,row['id'],
reac_str=row['equation'],
references=refs,
idprefix=idprefix,
namespace=namespace)
# Unknown database
case _:
mes = f'''Unknown database name for reaction reconstruction: {row["via"]}\n
Expand Down Expand Up @@ -698,7 +705,7 @@ def __init__(self, organismid) -> None:

# @TODO: parallelising
# @TODO: logging
def get_missing_genes(self, model:libModel) -> pd.DataFrame:
def find_missing_genes(self, model:libModel) -> pd.DataFrame:
"""Get the missing genes in model in comparison to the KEGG entry of the
organism.
Expand Down Expand Up @@ -732,7 +739,7 @@ def get_model_genes(model: libModel) -> pd.DataFrame:
for idx in range(cv_term.getNumResources()):
uri = cv_term.getResourceURI(idx)
if 'kegg.genes' in uri:
genes_in_model.append(re.split('kegg.genes:|kegg.genes/',uri)[1]) # work with olf/new pattern
genes_in_model.append(re.split('kegg.genes:|kegg.genes/',uri)[1]) # work with old/new pattern

return pd.DataFrame(genes_in_model, columns=['orgid:locus'])

Expand Down Expand Up @@ -775,7 +782,7 @@ def get_model_genes(model: libModel) -> pd.DataFrame:
# @TODO : paralellising possibilities?
# @TODO : progress bar
# @TODO : self._statistics for reactions
def get_missing_reacs(self,model:cobra.Model,genes_not_in_model):
def find_missing_reacs(self,model:cobra.Model,genes_not_in_model):

# Step 1: filter missing gene list + extract ECs
# ----------------------------------------------
Expand Down Expand Up @@ -839,13 +846,10 @@ class BioCycGapFiller(GapFiller):
'id' | 'ncbiprotein' | 'name' | 'equation' | 'Reactants' |
'Products' | 'ec-code' | 'Reaction-Direction' | 'via' | 'add_to_gpr'
"""
# @TODO: Add column 'references'
# & move entries from 'name', 'Reactants', 'Products', 'ec-code' &
# 'Reaction-Direction' to 'references'
# @NOTE: Columns 'Reactants' & 'Products' could be unnecessary
# -> Retrieve information from BioCyc-API?
# -> Retrieve information from MetaCyc Compounds table?
# (Could maybe also added to rg internal db)
# (Could maybe also be added to rg internal db)
def __init__(self, biocyc_gene_tbl_path: str,
biocyc_reacs_tbl_path: str, gff:str) -> None:
super().__init__()
Expand All @@ -861,7 +865,7 @@ def biocyc_gene_tbl(self):
return self._biocyc_gene_tbl

@biocyc_gene_tbl.setter
# @TODO: Hier sollten wir noch diskutieren, ob Accession-2 oder Accession-1
# @DISCUSSION: Hier sollten wir noch diskutieren, ob Accession-2 oder Accession-1
# hard coded sein sollten oder nicht
# Dafür müssten wir nochmal ein paar entries in BioCyc dazu anschauen.
# Locus tags in GenBank GFF file == BioCyc Accession-2 == Old locus tags in
Expand All @@ -882,25 +886,27 @@ def biocyc_gene_tbl(self, biocyc_gene_tbl_path: str):
Table containing only rows where a 'Reaction of gene' exists
"""
# Read table
biocyc_genes = pd.read_table(
self._biocyc_gene_tbl = pd.read_table(
biocyc_gene_tbl_path,
usecols=['Accession-2', 'Reactions of gene'],
dtype=str
)

# Rename columns for further use
biocyc_genes.rename(
self._biocyc_gene_tbl.rename(
columns={
'Accession-2': 'locus_tag', 'Reactions of gene': 'id'
},
inplace=True)

# Turn empty strings into NaNs & Remove NaNs
# @DISCUSSION: Too hard-coded? Should I keep the NaNs? Or at least keep them for the report?
biocyc_genes.replace('', np.nan, inplace=True)
biocyc_genes.dropna(inplace=True)

self._biocyc_gene_tbl = biocyc_genes
# Turn empty strings into NaNs
self._biocyc_gene_tbl.replace('', np.nan, inplace=True)

# Save not mappable genes
self.manual_curation['BioCyc genes not mappable'] = self._biocyc_gene_tbl[self._biocyc_gene_tbl.isna()]

# Remove NaNs
self._biocyc_gene_tbl.dropna(inplace=True)

@property
def biocyc_rxn_tbl(self):
Expand All @@ -925,7 +931,7 @@ def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
Table containing all BioCyc reactions from provided file
"""
# Read table
biocyc_reacs = pd.read_table(
self._biocyc_rxn_tbl = pd.read_table(
biocyc_reacs_tbl_path,
usecols=[
'Reaction', 'Object ID', 'Reactants of reaction',
Expand All @@ -936,7 +942,7 @@ def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
)

# Rename columns for further use
biocyc_reacs.rename(
self._biocyc_rxn_tbl.rename(
columns={
'Reaction': 'equation', 'Object ID': 'id',
'Reactants of reaction': 'Reactants',
Expand All @@ -947,17 +953,14 @@ def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
)

# Turn empty strings into NaNs
biocyc_reacs.replace('', np.nan, inplace=True)
self._biocyc_rxn_tbl.replace('', np.nan, inplace=True)

# Set entries in is_spontaneous to booleans &
# specify empty entries in 'is_spontaneous' as False
biocyc_reacs['is_spontaneous'].replace({'T': True, 'F': False}, inplace=True)
biocyc_reacs['is_spontaneous'] = biocyc_reacs['is_spontaneous'].fillna(False)

self._biocyc_rxn_tbl = biocyc_reacs
self._biocyc_rxn_tbl['is_spontaneous'].replace({'T': True, 'F': False}, inplace=True)
self._biocyc_rxn_tbl['is_spontaneous'] = self._biocyc_rxn_tbl['is_spontaneous'].fillna(False)

# @DISCUSSION: Change get method to setter?
def get_missing_genes(self, model: libModel):
def find_missing_genes(self, model: libModel):
"""Retrieves the missing genes and reactions from the BioCyc table
according to the 'Accession-2' identifiers
Expand Down Expand Up @@ -1007,7 +1010,7 @@ def get_missing_genes(self, model: libModel):
self.missing_genes['locus_tag'].unique().tolist()
)

def get_missing_reacs(self, model: cobra.Model):
def find_missing_reacs(self, model: cobra.Model):
"""Retrieves the missing reactions with more information like the
equation, EC code, etc. according to the missing genes
Expand All @@ -1019,7 +1022,7 @@ def get_missing_reacs(self, model: cobra.Model):
# Step 1: filter missing gene list + extract ECs
# ----------------------------------------------
# Drop locus tag column as not needed for here
missing_genes = self.missing_genes.drop('locus_tag', axis=1)
missing_genes = self.missing_genes.drop(['locus_tag', 'name'], axis=1)

# Expand missing genes result table to merge with Biocyc reactions table
missing_genes = pd.DataFrame(
Expand All @@ -1033,53 +1036,59 @@ def get_missing_reacs(self, model: cobra.Model):
missing_genes['id'] = missing_genes['id'].str.strip()

# Get missing reactions from missing genes
missing_reacs = missing_genes.merge(
self.missing_reacs = missing_genes.merge(
self.biocyc_rxn_tbl, on='id'
)

# Turn entries with '//' into lists
missing_reacs['Reactants'] = missing_reacs['Reactants'].str.split('\s*//\s*')
missing_reacs['Products'] = missing_reacs['Products'].str.split('\s*//\s*')
missing_reacs['ec-code'] = missing_reacs['ec-code'].str.split('\s*//\s*')
self.missing_reacs['Reactants'] = self.missing_reacs['Reactants'].str.split('\s*//\s*')
self.missing_reacs['Products'] = self.missing_reacs['Products'].str.split('\s*//\s*')
self.missing_reacs['ec-code'] = self.missing_reacs['ec-code'].str.split('\s*//\s*')

# Step 2: Get content for column ncbiprotein
# ------------------------------------------
# Turn ncbiprotein column into lists of ncbiprotein IDs per reaction
ncbiprotein_as_list = missing_reacs.groupby('id')['ncbiprotein'].apply(list).reset_index(name='ncbiprotein')
missing_reacs.drop('ncbiprotein', axis=1, inplace=True)
missing_reacs = ncbiprotein_as_list.merge(missing_reacs, on='id')
missing_reacs['ncbiprotein'] = missing_reacs['ncbiprotein'].apply(
ncbiprotein_as_list = self.missing_reacs.groupby('id')['ncbiprotein'].apply(list).reset_index(name='ncbiprotein')
self.missing_reacs.drop('ncbiprotein', axis=1, inplace=True)
self.missing_reacs = ncbiprotein_as_list.merge(self.missing_reacs, on='id')
self.missing_reacs['ncbiprotein'] = self.missing_reacs['ncbiprotein'].apply(
lambda x:
x if not None in x else list(filter(None, x))
)

# Add 'G_spontaneous' as gene product if marked as spontaneous &
# drop is_spontaneous column
missing_reacs['ncbiprotein'] = missing_reacs.apply(
self.missing_reacs['ncbiprotein'] = self.missing_reacs.apply(
lambda x:
x['ncbiprotein'] if not x['is_spontaneous'] else x['ncbiprotein'].append('spontaneous'),
axis=1
)
missing_reacs.drop('is_spontaneous', axis=1, inplace=True)
self.missing_reacs.drop('is_spontaneous', axis=1, inplace=True)

# Step 3: Get amount of missing reactions from BioCyc for statistics
# ------------------------------------------------------------------
self._statistics['reactions']['missing (before)'] = len(
missing_reacs['id'].unique().tolist()
self.missing_reacs['id'].unique().tolist()
)

# Step 4: Map to model reactions & cleanup
# ----------------------------------------
# Add column 'via'
missing_reacs['via'] = 'BioCyc'
self.missing_reacs['via'] = 'BioCyc'

# Filter reacs for already in model
missing_reacs['add_to_GPR'] = missing_reacs.apply(
self.missing_reacs['add_to_GPR'] = self.missing_reacs.apply(
lambda x:
self._find_reac_in_model(model,x['ec-code'],x['id'],x['via']), axis=1
)

self.missing_reacs = missing_reacs

# Add column 'references'
# & move entries from 'Reactants', 'Products' &
# 'Reaction-Direction' to 'references'
self.missing_reacs['references'] = self.missing_reacs[['Reactants', 'Products', 'Reaction-Direction']].to_dict('records')

# Remove columns 'Reactants', 'Products' & 'Reaction-Direction'
self.missing_reacs.drop(['Reactants', 'Products', 'Reaction-Direction'], axis=1, inplace=True)


# ----------------
Expand Down Expand Up @@ -1128,7 +1137,7 @@ class GeneGapFiller(GapFiller):
def __init__(self) -> None:
super().__init__()

def get_missing_genes(self,gffpath:Union[str|Path],model:libModel) -> pd.DataFrame:
def find_missing_genes(self,gffpath:Union[str|Path],model:libModel) -> pd.DataFrame:

# get all CDS from gff
all_genes = parse_gff_for_cds(gffpath,self.GFF_COLS)
Expand Down Expand Up @@ -1157,7 +1166,7 @@ def get_missing_genes(self,gffpath:Union[str|Path],model:libModel) -> pd.DataFra
return missing_genes


def get_missing_reacs(self, model:cobra.Model,
def find_missing_reacs(self, model:cobra.Model,
missing_genes:pd.DataFrame,
# prefix for pseudo ncbiprotein ids
prefix:str='refinegems',
Expand Down

0 comments on commit 1c1420c

Please sign in to comment.