Adjusted connections due to new db_access set up #126

draeger-lab · Aug 21, 2024 · 4df40b4 · 4df40b4
1 parent cac6ad6
commit 4df40b4
Show file tree

Hide file tree

Showing 16 changed files with 299 additions and 311 deletions.
diff --git a/docs/source/modules/utility.rst b/docs/source/modules/utility.rst
@@ -31,6 +31,17 @@ databases module
    :private-members: 
    :special-members:
 
+db\_access module
+-----------------
+
+.. automodule:: refinegems.utility.db_access
+   :members:
+   :undoc-members:
+   :show-inheritance:
+   :private-members: 
+   :special-members:
+
+
 entities module
 ---------------
 

diff --git a/src/refinegems/analysis/growth.py b/src/refinegems/analysis/growth.py
@@ -18,7 +18,7 @@
 import yaml
 
 from cobra import Model as cobraModel
-from ..utility.entities import test_biomass_presence
+from ..utility.util import test_biomass_presence
 from ..utility.io import load_model, load_a_table_from_database
 from ..classes.reports import SingleGrowthSimulationReport, GrowthSimulationReport, AuxotrophySimulationReport, SourceTestReport
 from ..classes.medium import Medium, medium_to_model, read_from_cobra_model, load_medium_from_db, read_external_medium

diff --git a/src/refinegems/analysis/investigate.py b/src/refinegems/analysis/investigate.py
@@ -21,7 +21,7 @@
 from memote.support import consistency_helpers as con_helpers
 
 from ..utility.io import search_sbo_label
-from ..utility.entities import test_biomass_presence
+from ..utility.util import test_biomass_presence
 
 ################################################################################
 # variables

diff --git a/src/refinegems/classes/egcs.py b/src/refinegems/classes/egcs.py
@@ -9,7 +9,7 @@
 
 from ..analysis.growth import MIN_GROWTH_THRESHOLD, set_bounds_to_default
 from .medium import Medium
-from ..utility.entities import test_biomass_presence
+from ..utility.util import test_biomass_presence
 
 import cobra
 import pandas as pd

diff --git a/src/refinegems/classes/reports.py b/src/refinegems/classes/reports.py
@@ -25,7 +25,7 @@
 from typing import Literal,Union
 
 from ..analysis.investigate import get_mass_charge_unbalanced, get_orphans_deadends_disconnected, get_reac_with_gpr
-from ..utility.entities import test_biomass_presence
+from ..utility.util import test_biomass_presence
 from ..developement.decorators import *
 
 ################################################################################

diff --git a/src/refinegems/curation/__init__.py b/src/refinegems/curation/__init__.py
@@ -1,6 +1,4 @@
-__all__ = ['db_access','biomass','charges','curate','pathways','polish']
-
-from . import db_access
+__all__ = ['biomass','charges','curate','pathways','polish']
 
 from . import biomass
 from . import charges

diff --git a/src/refinegems/curation/biomass.py b/src/refinegems/curation/biomass.py
@@ -17,7 +17,7 @@
 from six import iteritems
 from typing import Union
 
-from ..utility.entities import test_biomass_consistency, test_biomass_presence
+from ..utility.util import test_biomass_consistency, test_biomass_presence
 
 ############################################################################
 # variables

diff --git a/src/refinegems/curation/charges.py b/src/refinegems/curation/charges.py
@@ -24,7 +24,7 @@
 
 from libsbml import Model as libModel
 
-from .db_access.modelseed import get_modelseed_compounds
+from ..utility.db_access import get_modelseed_compounds
 
 ############################################################################
 # functions

diff --git a/src/refinegems/curation/curate.py b/src/refinegems/curation/curate.py
@@ -20,7 +20,8 @@
 from typing import Literal
 
 from ..utility.cvterms import add_cv_term_reactions, add_cv_term_metabolites, metabol_db_dict, get_id_from_cv_term
-from ..utility.entities import create_gpr_from_locus_tag, create_reaction,test_biomass_presence
+from ..utility.entities import create_gpr_from_locus_tag, create_reaction
+from ..utility.util import test_biomass_presence
 
 ################################################################################
 # variables

diff --git a/src/refinegems/curation/polish.py b/src/refinegems/curation/polish.py
@@ -31,7 +31,8 @@
 from typing import Union
 
 from ..utility.cvterms import add_cv_term_units, add_cv_term_metabolites, add_cv_term_reactions, add_cv_term_genes, generate_cvterm, metabol_db_dict, reaction_db_dict, MIRIAM, OLD_MIRIAM
-from ..utility.io import search_ncbi_for_gpr, parse_gff_for_refseq_info, parse_fasta_headers, parse_dict_to_dataframe, load_a_table_from_database
+from ..utility.db_access import search_ncbi_for_gpr
+from ..utility.io import parse_gff_for_refseq_info, parse_fasta_headers, parse_dict_to_dataframe, load_a_table_from_database
 
 ################################################################################
 # variables

diff --git a/src/refinegems/utility/__init__.py b/src/refinegems/utility/__init__.py
@@ -1,8 +1,9 @@
-__all__ = ['connections', 'cvterms','databases','entities','io','set_up','util']
+__all__ = ['connections', 'cvterms', 'databases', 'db_access', 'entities','io','set_up','util']
 
 from . import connections
 from . import cvterms
 from . import databases
+from . import db_access
 from . import entities
 from . import io
 from . import set_up

diff --git a/src/refinegems/utility/connections.py b/src/refinegems/utility/connections.py
@@ -11,7 +11,9 @@
 import cobra
 import json
 import memote
+import pandas as pd
 import shutil
+import subprocess
 import tempfile
 import time
 import warnings
@@ -33,7 +35,7 @@
 # needed by memote.support.consistency
 from memote.support import consistency_helpers as con_helpers
 
-from .entities import test_biomass_presence
+from .util import test_biomass_presence
 from .io import write_model_to_file
 
 # note:
@@ -45,13 +47,6 @@
 # variables
 ################################################################################
 
-# database urls
-# -------------
-
-BIGG_REACTIONS_URL = 'http://bigg.ucsd.edu/api/v2/universal/reactions/' #: :meta: 
-BIGG_METABOLITES_URL = 'http://bigg.ucsd.edu/api/v2/universal/metabolites/' #: :meta: 
-
-
 ################################################################################
 # functions
 ################################################################################
@@ -147,6 +142,94 @@ def adjust_BOF(genome:str, model_file:str, model:cobra.Model, dna_weight_fractio
     return new_objective
 
 
+# DIAMOND
+# -------
+
+# @ISSUE / @NOTE / @TODO
+#   is this the right place to put these functions?
+def run_DIAMOND_blastp(fasta:str, db:str, 
+                       sensitivity:Literal['sensitive', 'more-sensitive', 'very-sensitive','ultra-sensitive']='more-sensitive',
+                       coverage:float=95.0,
+                       threads:int=2,
+                       outdir:str=None, outname:str='DIAMOND_blastp_res.tsv') -> str:
+    """Run DIAMOND in BLASTp mode.
+
+    Args:
+        - fasta (str): 
+            The FASTA file to BLAST for.
+        - db (str): 
+            The DIAMOND database file to BLAST against
+        - sensitivity (Literal['sensitive', 'more-sensitive', 'very-sensitive','ultra-sensitive'], optional): 
+            Sensitivity mode for DIAMOND. 
+            Defaults to 'more-sensitive'.
+        - coverage (float, optional): 
+            A parameter for DIAMOND.
+            Query coverage threshold for the hits (passed to DIAMOND as --query-cover). 
+            Defaults to 95.0.
+        - threads (int, optional): 
+            A parameter for DIAMOND.
+            Number of threads to be used while BLASTing.
+            Defaults to 2.
+        - outdir (str, optional): 
+            Path to a directory to write the output files to. 
+            Defaults to None.
+        - outname (str, optional): 
+            Name of the result file (name only, not a path). Only used if outdir is not set; with outdir, the file is always named 'DIAMOND_blastp_res.tsv'. 
+            Defaults to 'DIAMOND_blastp_res.tsv'.
+
+    Returns:
+        str: 
+            Path to the results of the DIAMOND BLASTp run (returned as a ``Path`` object, not a plain string). 
+    """
+
+    if outdir:
+        outname = Path(outdir,'DIAMOND_blastp_res.tsv')
+        logfile = Path(outdir,'log_DIAMOND_blastp.txt')
+    else:
+        outname = Path(outname)
+        logfile = Path('log_DIAMOND_blastp.txt')
+
+    # @TODO: test, if it works with different paths and their problems  
+    # @TODO: write additional output to a logfile, not stderr
+    subprocess.run([F'diamond blastp -d {db} -q {fasta} --{sensitivity} --query-cover {coverage} -p {int(threads)} -o {outname} --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore 2> {logfile}'], shell=True)
+
+    return outname
+
+def filter_DIAMOND_blastp_results(blasttsv:str, pid_theshold:float=90.0) -> pd.DataFrame:
+    """Filter the results of a DIAMOND BLASTp run (see 
+    :py:func:`~refinegems.utility.connections.run_DIAMOND_blastp`)
+    by percentage identity value (PID) and extract the matching pairs of query 
+    and subject IDs.
+
+    Args:
+        - blasttsv (str): 
+            Path to the DIAMOND BLASTp result file.
+        - pid_theshold (float, optional): 
+            Threshold value for the PID. Given in percent.
+            Defaults to 90.0.
+
+    Raises:
+        - ValueError: PID threshold has to be between 0.0 and 100.0
+
+    Returns:
+        pd.DataFrame: 
+            A table with the columns query_ID and subject_ID containing the hits from the
+            BLAST run with a PID greater than or equal to the given threshold value.
+    """
+
+    if pid_theshold > 100.0 or pid_theshold < 0.0:
+        raise ValueError('PID threshold has to be between 0.0 and 100.0')
+
+    # load diamond results
+    diamond_results = pd.read_csv(blasttsv, sep='\t', header=None)
+    diamond_results.columns = ['query_ID', 'subject_ID', 'PID', 'align_len', 'no_mismatch', 'no_gapopen', 'query_start', 'query_end', 'subject_start', 'subject_end','E-value','bitscore']
+    # filter by PID
+    diamond_results = diamond_results[diamond_results['PID']>=pid_theshold]
+    # trim cols
+    diamond_results = diamond_results[['query_ID','subject_ID']]
+
+    return diamond_results
+
 
 # MCC - MassChargeCuration
 # ------------------------
@@ -308,3 +391,4 @@ def run_SBOannotator(model: libModel) -> libModel:
         model = sbo_annotator(doc,model,'constrained-based',str(Path(tempdir,'dbs')),str(Path(tempdir,'dud.xml')))
     return model
 
+
diff --git a/src/refinegems/utility/cvterms.py b/src/refinegems/utility/cvterms.py
@@ -10,6 +10,7 @@
 # requirements
 ################################################################################
 
+import cobra
 import logging
 from libsbml import BIOLOGICAL_QUALIFIER, BQB_IS, BQB_OCCURS_IN, BQB_IS_HOMOLOG_TO, MODEL_QUALIFIER, BQM_IS_DESCRIBED_BY, Unit, CVTerm, Species, Reaction, GeneProduct, Group, SBase
 
@@ -80,6 +81,33 @@
 # functions
 ################################################################################
 
+# cobra
+# -----
+
+def _add_annotations_from_dict_cobra(references:dict, entity:cobra.Reaction|cobra.Metabolite|cobra.Model) -> None:
+    """Given a dictionary and a cobra object, add the former as annotations to the latter.
+    The keys of the dictionary are used as the annotation labels, the values as the values.
+    If the keys are already in the entity, the values will be combined (union).
+
+    Args:
+        - references (dict): 
+            The dictionary with the references to add to the entity.
+        - entity (cobra.Reaction | cobra.Metabolite | cobra.Model): 
+            The entity to add annotations to.
+    """
+    # add additional references from the parameter
+    for db,idlist in references.items():
+        if not isinstance(idlist,list):
+            idlist = [idlist]
+        if db in entity.annotation.keys():
+            entity.annotation[db] = list(set(entity.annotation[db] + idlist))
+        else:
+            entity.annotation[db] = idlist
+
+
+
+# libsbml
+# -------
 
 def add_cv_term_units(unit_id: str, unit: Unit, relation: int):
     """Adds CVTerm to a unit

diff --git a/src/refinegems/utility/databases.py b/src/refinegems/utility/databases.py
@@ -188,6 +188,7 @@ def get_database_links_info_per_row(row:pd.Series):
                 key, value = link.split(':',1)
                 key = key.strip()
                 value = value.rsplit('/',1)[1].strip()
+                value = value.removeprefix('META:') # @TODO: Make case insensitive!
                 if key in database_ids.keys():
                     database_ids[key].append(value)
                 else: