Merge pull request #228 from WrightonLabCSU/improved_amg_summary

Improved AMG summary. This update changes the AMG summary so that it provides all information from both the Metabolic DB and the AMG DB. It is a bit of a kitchen-sink approach, but we can always cut it down if needed, and I would call most DRAM-v users advanced. I could not fix the SQL; someone more familiar with SQLAlchemy is free to do so. I will make a branch to deal with this.
WrightonLabCSU · Jan 6, 2023 · ac83ba7 · ac83ba7
2 parents 76f7103 + e592376
commit ac83ba7
Show file tree

Hide file tree

Showing 10 changed files with 374 additions and 169 deletions.
diff --git a/mag_annotator/CONFIG b/mag_annotator/CONFIG
@@ -1,28 +1,155 @@
 {
   "search_databases": {
-    "kegg": null,
-    "kofam_hmm": null,
-    "kofam_ko_list": null,
-    "uniref": null,
-    "pfam": null,
-    "dbcan": null,
-    "viral": null,
-    "peptidase": null,
-    "vogdb": null
+    "kegg": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kegg.20221012.mmsdb",
+    "kofam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_profiles.hmm",
+    "kofam_ko_list": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_ko_list.tsv",
+    "uniref": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/uniref90.20220928.mmsdb",
+    "pfam": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/pfam.mmspro",
+    "dbcan": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/dbCAN-HMMdb-V11.txt",
+    "viral": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/refseq_viral.20220928.mmsdb",
+    "peptidase": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/peptidases.20220928.mmsdb",
+    "vogdb": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_latest_hmms.txt"
   },
-  "custom_dbs": null,
   "database_descriptions": {
-    "pfam_hmm_dat": null,
-    "dbcan_fam_activities": null,
-    "vog_annotations": null
+    "pfam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/Pfam-A.hmm.dat.gz",
+    "dbcan_fam_activities": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam-activities.txt",
+    "dbcan_subfam_ec": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam.subfam.ec.txt",
+    "vog_annotations": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_annotations_latest.tsv.gz"
   },
   "dram_sheets": {
-    "genome_summary_form": null,
-    "module_step_form": null,
-    "etc_module_database": null,
-    "function_heatmap_form": null,
-    "amg_database": null
+    "genome_summary_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/genome_summary_form.20220928.tsv",
+    "module_step_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/module_step_form.20220928.tsv",
+    "etc_module_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/etc_mdoule_database.20220928.tsv",
+    "function_heatmap_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/function_heatmap_form.20220928.tsv",
+    "amg_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/amg_database.20220928.tsv"
   },
-  "description_db": null,
-  "dram_version": null
-}
+  "dram_version": "1.4.0rc1",
+  "description_db": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/description_db.sqlite",
+  "setup_info": {
+    "kegg": {
+      "name": "KEGG db",
+      "description_db_updated": "10/12/2022, 18:52:36",
+      "citation": " M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021."
+    },
+    "kofam_hmm": {
+      "name": "KOfam db",
+      "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
+      "Download time": "09/28/2022, 11:00:09",
+      "Origin": "Downloaded by DRAM"
+    },
+    "kofam_ko_list": {
+      "name": "KOfam KO list",
+      "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
+      "Download time": "09/28/2022, 11:00:11",
+      "Origin": "Downloaded by DRAM"
+    },
+    "uniref": {
+      "name": "UniRef db",
+      "description_db_updated": "09/29/2022, 13:14:40",
+      "citation": "Y. Wang, Q. Wang, H. Huang, W. Huang, Y. Chen, P. B. McGarvey, C. H. Wu, C. N. Arighi, and U. Consortium, \"A crowdsourcing open platform for literature curation in uniprot,\" PLoS Biology, vol. 19, no. 12, p. e3001464, 2021.",
+      "version": "90",
+      "Download time": "09/28/2022, 11:15:01",
+      "Origin": "Downloaded by DRAM"
+    },
+    "pfam": {
+      "name": "Pfam db",
+      "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
+      "Download time": "09/28/2022, 11:49:29",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:23:47"
+    },
+    "pfam_hmm": {
+      "name": "Pfam hmm dat",
+      "description_db_updated": "Unknown, or Never",
+      "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
+      "Download time": "09/28/2022, 11:49:31",
+      "Origin": "Downloaded by DRAM"
+    },
+    "dbcan": {
+      "name": "dbCAN db",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:23:50"
+    },
+    "dbcan_fam_activities": {
+      "name": "dbCAN family activities",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "upload_date": "08062022",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM"
+    },
+    "dbcan_subfam_ec": {
+      "name": "dbCAN subfamily EC numbers",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "upload_date": "08062022",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM"
+    },
+    "vogdb": {
+      "name": "VOGDB db",
+      "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
+      "version": "latest",
+      "Download time": "09/28/2022, 11:51:57",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:24:14"
+    },
+    "vog_annotations": {
+      "name": "VOG annotations",
+      "description_db_updated": "Unknown, or Never",
+      "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
+      "version": "latest",
+      "Download time": "09/28/2022, 11:51:58",
+      "Origin": "Downloaded by DRAM"
+    },
+    "viral": {
+      "name": "RefSeq Viral db",
+      "description_db_updated": "09/29/2022, 13:16:15",
+      "citation": "J. R. Brister, D. Ako-Adjei, Y. Bao, and O. Blinkova, \"Ncbi viral genomes resource,\" Nucleic acids research, vol. 43, no. D1, pp. D571\u2013D577, 2015. M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021.",
+      "viral_files": 2,
+      "Download time": "09/28/2022, 11:52:20",
+      "Origin": "Downloaded by DRAM"
+    },
+    "peptidase": {
+      "name": "MEROPS peptidase db",
+      "description_db_updated": "09/29/2022, 13:23:40",
+      "citation": "N. D. Rawlings, A. J. Barrett, P. D. Thomas, X. Huang, A. Bateman, and R. D. Finn, \"The merops database of proteolytic enzymes, their substrates and inhibitors in 2017 and a comparison with peptidases in the panther database,\" Nucleic acids research, vol. 46, no. D1, pp. D624\u2013D632, 2018.",
+      "Download time": "09/28/2022, 12:01:46",
+      "Origin": "Downloaded by DRAM"
+    },
+    "genome_summary_form": {
+      "name": "Genome summary form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:46",
+      "Origin": "Downloaded by DRAM"
+    },
+    "module_step_form": {
+      "name": "Module step form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "function_heatmap_form": {
+      "name": "Function heatmap form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "amg_database": {
+      "name": "AMG database",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "etc_module_database": {
+      "name": "ETC module database",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    }
+  },
+  "log_path": null
+}
diff --git a/mag_annotator/__init__.py b/mag_annotator/__init__.py
@@ -1 +1 @@
-__version__ = '1.4.3'
+__version__ = '1.4.4'
diff --git a/mag_annotator/database_handler.py b/mag_annotator/database_handler.py
@@ -5,11 +5,10 @@
 import logging
 from shutil import copy2
 import warnings
-from sqlalchemy import create_engine
-from sqlalchemy.orm import sessionmaker
-
 from datetime import datetime
 from functools import partial
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
 
 import pandas as pd
 

diff --git a/mag_annotator/database_processing.py b/mag_annotator/database_processing.py
@@ -26,54 +26,6 @@
 DEFAULT_MMMSPRO_DB_NAME = 'db'
 
 
-# KOFAM_CITATION = ("Aramaki T., Blanc-Mathieu R., Endo H., Ohkubo K., Kanehisa "
-#                   "M., Goto S., Ogata H.\nKofamKOALA: KEGG ortholog assignment"
-#                   " based on profile HMM and adaptive score threshold.\nBioinf"
-#                   "ormatics. 2019 Nov 19. pii: btz859. doi: 10.1093/bioinforma"
-#                   "tics/btz859."
-#                   ) # arguably not for kofam but the closest I saw
-# VIRAL_REFSEQ_CITATION = ("Brister JR, Ako-Adjei D, Bao Y, Blinkova O. NCBI vir"
-#                          "al genomes resource. Nucleic Acids Res. 2015 Jan;43("
-#                          "Database issue):D571-7 PubMed PubMedCentral"
-#                          ) # Three options but this one is viral specific
-# KEGG_CITATION = ("Kanehisa, M., Furumichi, M., Sato, Y., Ishiguro-Watanabe, M."
-#                  ", and Tanabe, M.; KEGG: integrating viruses and cellular org"
-#                  "anisms. Nucleic Acids Res. 49, D545-D551 (2021)."
-#                  )
-# PFAM_CITATION = ("Pfam: The protein families database in 2021: J. Mistry, S. C"
-#                  "huguransky, L. Williams, M. Qureshi, G.A. Salazar, E.L.L. So"
-#                  "nnhammer, S.C.E. Tosatto, L. Paladin, S. Raj, L.J. Richardso"
-#                  "n, R.D. Finn, A. Bateman"
-#                  )
-# PEPTIDASE_CITATION = ("Rawlings, N.D., Barrett, A.J., Thomas, P.D., Huang, X.,"
-#                       " Bateman, A. & Finn, R.D. (2018) The MEROPS database of"
-#                       " proteolytic enzymes, their substrates and inhibitors i"
-#                       "n 2017 and a comparison with peptidases in the PANTHER "
-#                       "database. Nucleic Acids Res 46, D624-D632."
-#                       )
-# VOGDB_CITATION = ("Thannesberger, J., Hellinger, H. J., Klymiuk, I., Kastner, M"
-#                   ". T., Rieder, F. J., Schneider, M., ... & Steininger, C. (20"
-#                   "17). Viruses comprise an extensive pool of mobile genetic el"
-#                   "ements in eukaryote cell cultures and human clinical samples"
-#                   ". The FASEB Journal, 31(5), 1987-2000."
-#                   )
-# UNIREF_CITATION = ("Wang Y, Wang Q, Huang H, Huang W, Chen Y, McGarvey PB, Wu C"
-#                    "H, Arighi CN, UniProt Consortium. A crowdsourcing open plat"
-#                    "form for literature curation in UniProt Plos Biology. 19(12"
-#                    "):e3001464 (2021)"
-#                    )
-# DBCAN_CITATION = ("Yin Y*, Mao X*, Yang JC, Chen X, Mao F and Xu Y, dbCAN: a we"
-#                   "b resource for automated carbohydrate-active enzyme annotati"
-#                   "on, Nucleic Acids Res. 2012 Jul;40(Web Server issue):W445-51"
-#                   ) # again a citation for the tool not the db
-# DRAM_CITATION = ("M. Shaffer, M. A. Borton, B. B. McGivern, A. A. Zayed, S. L. "
-#                  "La Rosa, L. M. Solden, P. Liu, A. B. Narrowe, J. Rodríguez-Ra"
-#                  "mos, B. Bolduc et al., \"Dram for distilling microbial metabo"
-#                  "lism to automate the curation of microbiome function,\" Nucle"
-#                  "ic acids research, vol. 48, no. 16, pp. 8883–8900, 2020."
-#                  )
-
-
 KOFAM_CITATION = ("T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa"
                   ", S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignme"
                   "nt based on profile hmm and adaptive score threshold,\" Bioin"

diff --git a/mag_annotator/database_setup.py b/mag_annotator/database_setup.py
@@ -82,25 +82,6 @@ def serialize(self):
             'dbcan_subfam_ec': self.ec,
         }
 
-
-# DBCAN_SUBFAM_EC_TABLE_NAME = 'dbcan_subfam_ec'
-
-
-# class DbcanSubfamEC(Base):
-#     __tablename__ = DBCAN_SUBFAM_EC_TABLE_NAME
-# 
-#     id = Column(String(30), primary_key=True, nullable=False, index=True)
-# 
-#     description = Column(String(1000))
-# 
-#     @property
-#     def serialize(self):
-#         return {
-#             'dbcan_id': self.id,
-#             'dbcan_subfam_ec': self.description,
-#         }
-
-
 VIRAL_DESCRIPTION_TABLE_NAME = 'viral_description'
 
 
@@ -164,7 +145,6 @@ def create_description_db(db_loc):
                             UNIREF_DESCRIPTION_TABLE_NAME: UniRefDescription,
                             PFAM_DESCRIPTION_TABLE_NAME: PfamDescription,
                             DBCAN_DESCRIPTION_TABLE_NAME: DbcanDescription,
-                            # DBCAN_SUBFAM_EC_TABLE_NAME: DbcanSubfamEC,
                             VIRAL_DESCRIPTION_TABLE_NAME: ViralDescription,
                             PEPTIDASE_DESCRIPTION_TABLE_NAME: PeptidaseDescription,
                             VOGDB_DESCRIPTION_TABLE_NAME: VOGDBDescription}
diff --git a/mag_annotator/summarize_genomes.py b/mag_annotator/summarize_genomes.py
@@ -97,7 +97,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou
     for genome, frame in annotations.groupby(groupby_column, sort=False):
         # make dict of identifiers to gene names
         id_gene_dict = defaultdict(list)
-        for gene, ids in get_ids_from_annotations_by_row(frame).iteritems():
+        for gene, ids in get_ids_from_annotations_by_row(frame).items():
             for id_ in ids:
                 id_gene_dict[id_].append(gene)
         # fill in genome summary_frame
@@ -316,7 +316,7 @@ def get_module_step_coverage(kos, module_net):
 def make_module_coverage_df(annotation_df, module_nets):
     kos_to_genes = defaultdict(list)
     ko_id_name = 'kegg_id' if 'kegg_id' in annotation_df.columns else 'ko_id'
-    for gene_id, ko_list in annotation_df[ko_id_name].iteritems():
+    for gene_id, ko_list in annotation_df[ko_id_name].items():
         if type(ko_list) is str:
             for ko in ko_list.split(','):
                 kos_to_genes[ko].append(gene_id)