Update gapfill #126 #52

draeger-lab · Aug 1, 2024 · 61843e2 · 61843e2
1 parent 78f7408
commit 61843e2
Show file tree

Hide file tree

Showing 4 changed files with 200 additions and 39 deletions.
diff --git a/dev/gapfill-testing.ipynb b/dev/gapfill-testing.ipynb
@@ -178,13 +178,6 @@
     "mapped_res[1]"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -270,7 +263,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -283,14 +276,17 @@
     "# @TODO merge with the function of the same name in entities, if possible\n",
     "# or just use them separatly \n",
     "# @TODO generalise addition of references -> maybe kwargs\n",
+    "# @TODO\n",
+    "# what to do about the name\n",
     "def create_gp(model, protein_id, \n",
     "              name:str=None, locus_tag:str=None,\n",
     "              uniprot:tuple[str,bool]=None):\n",
     "    \n",
     "    # create gene product object\n",
     "    gp = model.getPlugin(0).createGeneProduct()\n",
     "    # set basic attributes\n",
-    "    gp.setId('G_' + protein_id)             # ID @TODO this results in a '' as ID!??!?!?\n",
+    "    geneid = f'G_{protein_id}'.replace('.','_') # remove problematic signs\n",
+    "    gp.setIdAttribute(geneid)               # ID \n",
     "    if name: gp.setName(name)               # Name  \n",
     "    if locus_tag: gp.setLabel(locus_tag)    # Label\n",
     "    gp.setSBOTerm('SBO:0000243')            # SBOterm\n",
@@ -306,9 +302,8 @@
     "        for uniprotid in uniprot[0]:\n",
     "            add_cv_term_genes(uniprotid, 'UNIPROT', gp, uniprot[1]) # UniProt\n",
     "   \n",
-    "# @TEST\n",
-    "# @ASK : does the model need to be returned?\n",
-    "# probably sort in GapFiller\n",
+    "   \n",
+    "# probably sort into GapFiller\n",
     "def add_genes_from_table(model, gene_table:pd.DataFrame):\n",
     "    \n",
     "    # ncbiprotein | locus_tag | ec-code | ...\n",
@@ -321,6 +316,12 @@
     "        create_gp(model, x['ncbiprotein'], \n",
     "                  locus_tag=x['locus_tag'],\n",
     "                  uniprot=(x['UniProt'],True))\n",
+    "        \n",
+    "\n",
+    "def create_gpr(reaction,gene):\n",
+    "    # Case 1:\n",
+    "    pass\n",
+    "\n",
     "\n",
     "def fill_model(model, missing_genes:pd.DataFrame, \n",
     "               missing_reacs:pd.DataFrame):\n",
@@ -332,57 +333,181 @@
     "    ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model['ncbiprotein']))]\n",
     "    genes_with_reacs_in_model = missing_genes[missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model)]\n",
     "    \n",
-    "    # add genes as gene products to model\n",
-    "    # @TODO\n",
-    "    # what to do about the name\n",
-    "    add_genes_from_table(model, genes_with_reacs_in_model)\n",
+    "    if len(genes_with_reacs_in_model) > 0:\n",
+    "        # add genes as gene products to model\n",
+    "        add_genes_from_table(model, genes_with_reacs_in_model)\n",
     "    \n",
-    "    # extend gene production rules \n",
-    "    # @TODO\n",
-    "    # add_gene_reac_associations_from_table(model,....)\n",
+    "        # extend gene production rules \n",
+    "        # @TODO\n",
+    "        # add_gene_reac_associations_from_table(model,....)\n",
     "   \n",
-    "    # what remains:\n",
-    "    missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]\n",
-    "    missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]\n",
+    "        # what remains:\n",
+    "        missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]\n",
+    "        missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]\n",
+    "    \n",
     "    \n",
     "    # Step 2: "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
     "# [*chain(*list(mapped_res[1][~mapped_res[1]['add_to_GPR']]['ncbiprotein']))]\n",
     "testmodel = model.clone()\n",
-    "before = testmodel.getPlugin(0).getListOfGeneProducts()\n",
     "testcase = mapped_res[1].copy()\n",
     "testcase.iloc[2,-1] = ['12DGR160tipp']\n",
     "fill_model(testmodel,mapped_res[0],testcase)\n",
-    "after = testmodel.getPlugin(0).getListOfGeneProducts()\n",
-    "# test_list = testmodel.getPlugin(0).getListOfGeneProducts()"
+    "after = testmodel.getPlugin(0).getListOfGeneProducts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "from libsbml import FbcOr, FbcAnd\n",
+    "\n",
+    "testmodel = model.clone()\n",
+    "x = 113 # 45 : None, 46 : One, 113 Or\n",
+    "id = 'WP_011274363_1'\n",
+    "reac = testmodel.getListOfReactions()[x].getPlugin(0)\n",
+    "# connection = 'or'\n",
+    "# test, if there is already a gpr\n",
+    "old_association_str = None\n",
+    "old_association_fbc = None\n",
+    "if reac.getGeneProductAssociation():\n",
+    "    old_association = reac.clone().getGeneProductAssociation().getListOfAllElements()\n",
+    "    if len(old_association) == 1:\n",
+    "        old_association_str = old_association[0].getGeneProduct()\n",
+    "    else:\n",
+    "        for el in old_association:\n",
+    "            if isinstance(el, FbcOr) or isinstance(el, FbcAnd):\n",
+    "                old_association_fbc = el # there should only be one object od this type -> @TODO check\n",
+    "                break\n",
+    "                \n",
+    "# create new gene product association \n",
+    "if old_association_str and isinstance(id,str):\n",
+    "    id = [old_association_str,id]\n",
+    "elif old_association_str  and isinstance(id,list):\n",
+    "    id.append(old_association_str)\n",
+    "    \n",
+    "# this does not work!!!!\n",
+    "# @IDEA: create a dummy gp e.g. a copy of the current one and copy it from there\n",
+    "if not old_association_fbc:\n",
+    "    new_association = reac.createGeneProductAssociation()\n",
+    "else:\n",
+    "    new_association = reac.createGeneProductAssociation().createOr()\n",
+    "    new_association.addAssociation(old_association_fbc)\n",
+    "    \n",
+    "if isinstance(id,str):\n",
+    "    new_association.createGeneProductRef().setGeneProduct(id)\n",
+    "elif isinstance(id,list) and len(id) == 1:\n",
+    "    new_association.createGeneProductRef().setGeneProduct(id[0])\n",
+    "elif isinstance(id,list) and len(id) > 1:\n",
+    "    gpa_or =  new_association.createOr()\n",
+    "    for i in id:\n",
+    "        gpa_or.createGeneProductRef().setGeneProduct(i)\n",
+    "        \n",
+    "    \n",
+    "\n",
+    "print(reac.getGeneProductAssociation().getListOfAllElements())\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1538\n"
+     ]
+    }
+   ],
+   "source": [
+    "testmodel = model.clone()\n",
+    "print(len(testmodel.getListOfReactions()))\n",
+    "testreac = testmodel.getListOfReactions()[113]\n",
+    "testreaccopy = testreac.clone()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'R_ADPT'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testreac.getId()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'R_ADPT'"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testreaccopy.getId()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "''"
+       "1538"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "penny = [_ for _ in after if _.getLabel() == 'SH0005']\n",
-    "penny[0].getId()"
+    "len(testmodel.getListOfReactions())"
    ]
   },
   {

diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py
@@ -13,7 +13,7 @@
 import pandas as pd
 import numpy as np
 
-from typing import Literal
+from typing import Literal, Union
 
 from libsbml import Model as libModel
 from bioservices.kegg import KEGG
@@ -137,7 +137,7 @@ def _map_ec_to_reac_kegg(unmapped_reacs):
 class GapFiller(ABC):
 
     def __init__(self) -> None:
-        self.full_gene_list = None
+        self.full_gene_list = None  # really a good idea? GeneGapFiller does not need it at all
         self.geneid_type = 'ncbi' # @TODO
         self._statistics = dict()
         self.manual_curation = dict()
@@ -156,7 +156,7 @@ def get_missing_reacs(self,model):
 
     def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str, 
                idtype:Literal['MetaNetX','KEGG','BiGG', 'BioCyc'], 
-               include_ec_match:bool=False):
+               include_ec_match:bool=False) -> Union[None, list]:
         # @TODO Ensure that user has requested BioCyc identifiers.org version? 
         # -> Could be done with polish_annotations
         MAPPING = {
@@ -592,7 +592,7 @@ class GeneGapFiller(GapFiller):
     def __init__(self) -> None:
         super().__init__()
 
-    def get_missing_genes(self,gffpath,model:libModel):
+    def get_missing_genes(self,gffpath,model:libModel) -> pd.DataFrame:
 
         # get all CDS from gff
         all_genes = parse_gff_for_cds(gffpath,self.GFF_COLS)
@@ -625,7 +625,7 @@ def get_missing_reacs(self, model:cobra.Model,
                           fasta:str=None, 
                           dmnd_db:str=None, 
                           swissprot_map:str=None,
-                          **kwargs):
+                          **kwargs) -> tuple:
 
         # Case 1:  no EC
         # --------------
@@ -711,6 +711,10 @@ def get_missing_reacs(self, model:cobra.Model,
         mapped_reacs['add_to_GPR'] = mapped_reacs.apply(lambda x: self._find_reac_in_model(model,x['ec-code'],x['id'],x['via']), axis=1)
 
         return updated_missing_genes, mapped_reacs
+
+
+
+
 ############################################################################
 # functions
 ############################################################################

diff --git a/src/refinegems/curation/db_access/kegg.py b/src/refinegems/curation/db_access/kegg.py
@@ -138,7 +138,22 @@ def bioservices_parse_KEGG_gene(locus_tag):
     return gene_info
 
 # @NEW / FASTER version of the above using Biopython
-def parse_KEGG_gene(locus_tag):
+def parse_KEGG_gene(locus_tag:str) -> dict:
+    """Based on a locus tag, fetch the corresponding KEGG entry and
+    parse it into a dictionary containing the following information (if available):
+    
+    - ec-code
+    - orthology
+    - references
+
+    Args:
+        - locus_tag (str): 
+            The locus in the format <orgnismid>:<locus_tag>
+
+    Returns:
+        dict: 
+            The collected information.
+    """
 
     gene_info = dict()
     gene_info['orgid:locus'] = locus_tag
@@ -221,7 +236,24 @@ def bioservices_parse_KEGG_ec(ec):
     return ec_info
 
 # @NEW / FASTER version of the above using Biopython
-def parse_KEGG_ec(ec):
+def parse_KEGG_ec(ec:str) -> dict:
+    """Based on an EC number, fetch the corresponding KEGG entry and
+    parse it into a dictionary containing the following information (if available):
+    
+    - ec-code
+    - id (kegg.reference)
+    - equation
+    - reference 
+    - pathway
+
+    Args:
+        - ec (str): 
+            The EC number in the format 'x.x.x.x'
+
+    Returns:
+        dict: 
+            The collected information about the KEGG entry.
+    """
 
     ec_info = dict()
     ec_info['ec-code'] = ec

diff --git a/src/refinegems/utility/entities.py b/src/refinegems/utility/entities.py
@@ -421,7 +421,7 @@ def create_gp(model: libModel, model_id: str, name: str, locus_tag: str, protein
     """
     id_db = None
     gp = model.getPlugin(0).createGeneProduct()
-    gp.setId(model_id)
+    gp.setId(model_id) # libsbml advised to use set/getIdAttribute
     gp.setName(name)
     gp.setLabel(locus_tag)
     gp.setSBOTerm('SBO:0000243')