Skip to content

Commit

Permalink
Update gapfill #126 #52
Browse files Browse the repository at this point in the history
  • Loading branch information
cb-Hades committed Aug 1, 2024
1 parent 78f7408 commit 61843e2
Show file tree
Hide file tree
Showing 4 changed files with 200 additions and 39 deletions.
187 changes: 156 additions & 31 deletions dev/gapfill-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,6 @@
"mapped_res[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -270,7 +263,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -283,14 +276,17 @@
"# @TODO merge with the function of the same name in entities, if possible\n",
"# or just use them separatly \n",
"# @TODO generalise addition of references -> maybe kwargs\n",
"# @TODO\n",
"# what to do about the name\n",
"def create_gp(model, protein_id, \n",
" name:str=None, locus_tag:str=None,\n",
" uniprot:tuple[str,bool]=None):\n",
" \n",
" # create gene product object\n",
" gp = model.getPlugin(0).createGeneProduct()\n",
" # set basic attributes\n",
" gp.setId('G_' + protein_id) # ID @TODO this results in a '' as ID!??!?!?\n",
" geneid = f'G_{protein_id}'.replace('.','_') # remove problematic signs\n",
" gp.setIdAttribute(geneid) # ID \n",
" if name: gp.setName(name) # Name \n",
" if locus_tag: gp.setLabel(locus_tag) # Label\n",
" gp.setSBOTerm('SBO:0000243') # SBOterm\n",
Expand All @@ -306,9 +302,8 @@
" for uniprotid in uniprot[0]:\n",
" add_cv_term_genes(uniprotid, 'UNIPROT', gp, uniprot[1]) # UniProt\n",
" \n",
"# @TEST\n",
"# @ASK : does the model need to be returned?\n",
"# probably sort in GapFiller\n",
" \n",
"# probably sort into GapFiller\n",
"def add_genes_from_table(model, gene_table:pd.DataFrame):\n",
" \n",
" # ncbiprotein | locus_tag | ec-code | ...\n",
Expand All @@ -321,6 +316,12 @@
" create_gp(model, x['ncbiprotein'], \n",
" locus_tag=x['locus_tag'],\n",
" uniprot=(x['UniProt'],True))\n",
" \n",
"\n",
"def create_gpr(reaction,gene):\n",
" # Case 1:\n",
" pass\n",
"\n",
"\n",
"def fill_model(model, missing_genes:pd.DataFrame, \n",
" missing_reacs:pd.DataFrame):\n",
Expand All @@ -332,57 +333,181 @@
" ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model['ncbiprotein']))]\n",
" genes_with_reacs_in_model = missing_genes[missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model)]\n",
" \n",
" # add genes as gene products to model\n",
" # @TODO\n",
" # what to do about the name\n",
" add_genes_from_table(model, genes_with_reacs_in_model)\n",
" if len(genes_with_reacs_in_model) > 0:\n",
" # add genes as gene products to model\n",
" add_genes_from_table(model, genes_with_reacs_in_model)\n",
" \n",
" # extend gene production rules \n",
" # @TODO\n",
" # add_gene_reac_associations_from_table(model,....)\n",
" # extend gene production rules \n",
" # @TODO\n",
" # add_gene_reac_associations_from_table(model,....)\n",
" \n",
" # what remains:\n",
" missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]\n",
" missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]\n",
" # what remains:\n",
" missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]\n",
" missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]\n",
" \n",
" \n",
" # Step 2: "
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# [*chain(*list(mapped_res[1][~mapped_res[1]['add_to_GPR']]['ncbiprotein']))]\n",
"testmodel = model.clone()\n",
"before = testmodel.getPlugin(0).getListOfGeneProducts()\n",
"testcase = mapped_res[1].copy()\n",
"testcase.iloc[2,-1] = ['12DGR160tipp']\n",
"fill_model(testmodel,mapped_res[0],testcase)\n",
"after = testmodel.getPlugin(0).getListOfGeneProducts()\n",
"# test_list = testmodel.getPlugin(0).getListOfGeneProducts()"
"after = testmodel.getPlugin(0).getListOfGeneProducts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"from libsbml import FbcOr, FbcAnd\n",
"\n",
"testmodel = model.clone()\n",
"x = 113 # 45 : None, 46 : One, 113 Or\n",
"id = 'WP_011274363_1'\n",
"reac = testmodel.getListOfReactions()[x].getPlugin(0)\n",
"# connection = 'or'\n",
"# test, if there is already a gpr\n",
"old_association_str = None\n",
"old_association_fbc = None\n",
"if reac.getGeneProductAssociation():\n",
" old_association = reac.clone().getGeneProductAssociation().getListOfAllElements()\n",
" if len(old_association) == 1:\n",
" old_association_str = old_association[0].getGeneProduct()\n",
" else:\n",
" for el in old_association:\n",
" if isinstance(el, FbcOr) or isinstance(el, FbcAnd):\n",
" old_association_fbc = el # there should only be one object od this type -> @TODO check\n",
" break\n",
" \n",
"# create new gene product association \n",
"if old_association_str and isinstance(id,str):\n",
" id = [old_association_str,id]\n",
"elif old_association_str and isinstance(id,list):\n",
" id.append(old_association_str)\n",
" \n",
"# this does not work!!!!\n",
"# @IDEA: create a dummy gp e.g. a copy of the current one and copy it from there\n",
"if not old_association_fbc:\n",
" new_association = reac.createGeneProductAssociation()\n",
"else:\n",
" new_association = reac.createGeneProductAssociation().createOr()\n",
" new_association.addAssociation(old_association_fbc)\n",
" \n",
"if isinstance(id,str):\n",
" new_association.createGeneProductRef().setGeneProduct(id)\n",
"elif isinstance(id,list) and len(id) == 1:\n",
" new_association.createGeneProductRef().setGeneProduct(id[0])\n",
"elif isinstance(id,list) and len(id) > 1:\n",
" gpa_or = new_association.createOr()\n",
" for i in id:\n",
" gpa_or.createGeneProductRef().setGeneProduct(i)\n",
" \n",
" \n",
"\n",
"print(reac.getGeneProductAssociation().getListOfAllElements())\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1538\n"
]
}
],
"source": [
"testmodel = model.clone()\n",
"print(len(testmodel.getListOfReactions()))\n",
"testreac = testmodel.getListOfReactions()[113]\n",
"testreaccopy = testreac.clone()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'R_ADPT'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"testreac.getId()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'R_ADPT'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"testreaccopy.getId()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"1538"
]
},
"execution_count": 21,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"penny = [_ for _ in after if _.getLabel() == 'SH0005']\n",
"penny[0].getId()"
"len(testmodel.getListOfReactions())"
]
},
{
Expand Down
14 changes: 9 additions & 5 deletions src/refinegems/classes/gapfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import pandas as pd
import numpy as np

from typing import Literal
from typing import Literal, Union

from libsbml import Model as libModel
from bioservices.kegg import KEGG
Expand Down Expand Up @@ -137,7 +137,7 @@ def _map_ec_to_reac_kegg(unmapped_reacs):
class GapFiller(ABC):

def __init__(self) -> None:
self.full_gene_list = None
self.full_gene_list = None # really a good idea? GeneGapFiller does not need it at all
self.geneid_type = 'ncbi' # @TODO
self._statistics = dict()
self.manual_curation = dict()
Expand All @@ -156,7 +156,7 @@ def get_missing_reacs(self,model):

def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str,
idtype:Literal['MetaNetX','KEGG','BiGG', 'BioCyc'],
include_ec_match:bool=False):
include_ec_match:bool=False) -> Union[None, list]:
# @TODO Ensure that user has requested BioCyc identifiers.org version?
# -> Could be done with polish_annotations
MAPPING = {
Expand Down Expand Up @@ -592,7 +592,7 @@ class GeneGapFiller(GapFiller):
def __init__(self) -> None:
super().__init__()

def get_missing_genes(self,gffpath,model:libModel):
def get_missing_genes(self,gffpath,model:libModel) -> pd.DataFrame:

# get all CDS from gff
all_genes = parse_gff_for_cds(gffpath,self.GFF_COLS)
Expand Down Expand Up @@ -625,7 +625,7 @@ def get_missing_reacs(self, model:cobra.Model,
fasta:str=None,
dmnd_db:str=None,
swissprot_map:str=None,
**kwargs):
**kwargs) -> tuple:

# Case 1: no EC
# --------------
Expand Down Expand Up @@ -711,6 +711,10 @@ def get_missing_reacs(self, model:cobra.Model,
mapped_reacs['add_to_GPR'] = mapped_reacs.apply(lambda x: self._find_reac_in_model(model,x['ec-code'],x['id'],x['via']), axis=1)

return updated_missing_genes, mapped_reacs




############################################################################
# functions
############################################################################
Expand Down
36 changes: 34 additions & 2 deletions src/refinegems/curation/db_access/kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,22 @@ def bioservices_parse_KEGG_gene(locus_tag):
return gene_info

# @NEW / FASTER version of the above using Biopython
def parse_KEGG_gene(locus_tag):
def parse_KEGG_gene(locus_tag:str) -> dict:
"""Based on a locus tag, fetch the corresponding KEGG entry and
parse it into a dictionary containing the following information (if available):
- ec-code
- orthology
- references
Args:
- locus_tag (str):
The locus in the format <orgnismid>:<locus_tag>
Returns:
dict:
The collected information.
"""

gene_info = dict()
gene_info['orgid:locus'] = locus_tag
Expand Down Expand Up @@ -221,7 +236,24 @@ def bioservices_parse_KEGG_ec(ec):
return ec_info

# @NEW / FASTER version of the above using Biopython
def parse_KEGG_ec(ec):
def parse_KEGG_ec(ec:str) -> dict:
"""Based on an EC number, fetch the corresponding KEGG entry and
parse it into a dictionary containing the following information (if available):
- ec-code
- id (kegg.reference)
- equation
- reference
- pathway
Args:
- ec (str):
The EC number in the format 'x.x.x.x'
Returns:
dict:
The collected information about the KEGG entry.
"""

ec_info = dict()
ec_info['ec-code'] = ec
Expand Down
2 changes: 1 addition & 1 deletion src/refinegems/utility/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def create_gp(model: libModel, model_id: str, name: str, locus_tag: str, protein
"""
id_db = None
gp = model.getPlugin(0).createGeneProduct()
gp.setId(model_id)
gp.setId(model_id) # libsbml advised to use set/getIdAttribute
gp.setName(name)
gp.setLabel(locus_tag)
gp.setSBOTerm('SBO:0000243')
Expand Down

0 comments on commit 61843e2

Please sign in to comment.