Skip to content

Commit

Permalink
Merge pull request #228 from WrightonLabCSU/improved_amg_summary
Browse files Browse the repository at this point in the history
Improved amg summary
This update changes the AMG summary so that it provides all
information from both the Metabolic DB and the AMG DB. It is a bit of a
kitchen-sink approach, but we can always cut it down if needed, and I
would call most DRAM-v users advanced.
I could not fix the SQL; someone more familiar with SQLAlchemy is free to do so.
I will make a branch to deal with this.
  • Loading branch information
rmFlynn authored Jan 6, 2023
2 parents 76f7103 + e592376 commit ac83ba7
Show file tree
Hide file tree
Showing 10 changed files with 374 additions and 169 deletions.
169 changes: 148 additions & 21 deletions mag_annotator/CONFIG
Original file line number Diff line number Diff line change
@@ -1,28 +1,155 @@
{
"search_databases": {
"kegg": null,
"kofam_hmm": null,
"kofam_ko_list": null,
"uniref": null,
"pfam": null,
"dbcan": null,
"viral": null,
"peptidase": null,
"vogdb": null
"kegg": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kegg.20221012.mmsdb",
"kofam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_profiles.hmm",
"kofam_ko_list": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_ko_list.tsv",
"uniref": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/uniref90.20220928.mmsdb",
"pfam": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/pfam.mmspro",
"dbcan": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/dbCAN-HMMdb-V11.txt",
"viral": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/refseq_viral.20220928.mmsdb",
"peptidase": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/peptidases.20220928.mmsdb",
"vogdb": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_latest_hmms.txt"
},
"custom_dbs": null,
"database_descriptions": {
"pfam_hmm_dat": null,
"dbcan_fam_activities": null,
"vog_annotations": null
"pfam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/Pfam-A.hmm.dat.gz",
"dbcan_fam_activities": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam-activities.txt",
"dbcan_subfam_ec": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam.subfam.ec.txt",
"vog_annotations": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_annotations_latest.tsv.gz"
},
"dram_sheets": {
"genome_summary_form": null,
"module_step_form": null,
"etc_module_database": null,
"function_heatmap_form": null,
"amg_database": null
"genome_summary_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/genome_summary_form.20220928.tsv",
"module_step_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/module_step_form.20220928.tsv",
"etc_module_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/etc_mdoule_database.20220928.tsv",
"function_heatmap_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/function_heatmap_form.20220928.tsv",
"amg_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/amg_database.20220928.tsv"
},
"description_db": null,
"dram_version": null
}
"dram_version": "1.4.0rc1",
"description_db": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/description_db.sqlite",
"setup_info": {
"kegg": {
"name": "KEGG db",
"description_db_updated": "10/12/2022, 18:52:36",
"citation": " M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021."
},
"kofam_hmm": {
"name": "KOfam db",
"citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
"Download time": "09/28/2022, 11:00:09",
"Origin": "Downloaded by DRAM"
},
"kofam_ko_list": {
"name": "KOfam KO list",
"citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
"Download time": "09/28/2022, 11:00:11",
"Origin": "Downloaded by DRAM"
},
"uniref": {
"name": "UniRef db",
"description_db_updated": "09/29/2022, 13:14:40",
"citation": "Y. Wang, Q. Wang, H. Huang, W. Huang, Y. Chen, P. B. McGarvey, C. H. Wu, C. N. Arighi, and U. Consortium, \"A crowdsourcing open platform for literature curation in uniprot,\" PLoS Biology, vol. 19, no. 12, p. e3001464, 2021.",
"version": "90",
"Download time": "09/28/2022, 11:15:01",
"Origin": "Downloaded by DRAM"
},
"pfam": {
"name": "Pfam db",
"citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
"Download time": "09/28/2022, 11:49:29",
"Origin": "Downloaded by DRAM",
"description_db_updated": "09/29/2022, 13:23:47"
},
"pfam_hmm": {
"name": "Pfam hmm dat",
"description_db_updated": "Unknown, or Never",
"citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
"Download time": "09/28/2022, 11:49:31",
"Origin": "Downloaded by DRAM"
},
"dbcan": {
"name": "dbCAN db",
"citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
"version": "11",
"Download time": "09/28/2022, 11:49:33",
"Origin": "Downloaded by DRAM",
"description_db_updated": "09/29/2022, 13:23:50"
},
"dbcan_fam_activities": {
"name": "dbCAN family activities",
"citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
"version": "11",
"upload_date": "08062022",
"Download time": "09/28/2022, 11:49:33",
"Origin": "Downloaded by DRAM"
},
"dbcan_subfam_ec": {
"name": "dbCAN subfamily EC numbers",
"citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
"version": "11",
"upload_date": "08062022",
"Download time": "09/28/2022, 11:49:33",
"Origin": "Downloaded by DRAM"
},
"vogdb": {
"name": "VOGDB db",
"citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
"version": "latest",
"Download time": "09/28/2022, 11:51:57",
"Origin": "Downloaded by DRAM",
"description_db_updated": "09/29/2022, 13:24:14"
},
"vog_annotations": {
"name": "VOG annotations",
"description_db_updated": "Unknown, or Never",
"citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
"version": "latest",
"Download time": "09/28/2022, 11:51:58",
"Origin": "Downloaded by DRAM"
},
"viral": {
"name": "RefSeq Viral db",
"description_db_updated": "09/29/2022, 13:16:15",
"citation": "J. R. Brister, D. Ako-Adjei, Y. Bao, and O. Blinkova, \"Ncbi viral genomes resource,\" Nucleic acids research, vol. 43, no. D1, pp. D571\u2013D577, 2015. [3] M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tan-abe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021.",
"viral_files": 2,
"Download time": "09/28/2022, 11:52:20",
"Origin": "Downloaded by DRAM"
},
"peptidase": {
"name": "MEROPS peptidase db",
"description_db_updated": "09/29/2022, 13:23:40",
"citation": "N. D. Rawlings, A. J. Barrett, P. D. Thomas, X. Huang, A. Bateman, and R. D. Finn, \"The merops database of proteolytic enzymes, their substrates and inhibitors in 2017 and a comparison with peptidases in the panther database,\" Nucleic acids research, vol. 46, no. D1, pp. D624\u2013D632, 2018.",
"Download time": "09/28/2022, 12:01:46",
"Origin": "Downloaded by DRAM"
},
"genome_summary_form": {
"name": "Genome summary form",
"branch": "master",
"Download time": "09/28/2022, 12:01:46",
"Origin": "Downloaded by DRAM"
},
"module_step_form": {
"name": "Module step form",
"branch": "master",
"Download time": "09/28/2022, 12:01:47",
"Origin": "Downloaded by DRAM"
},
"function_heatmap_form": {
"name": "Function heatmap form",
"branch": "master",
"Download time": "09/28/2022, 12:01:47",
"Origin": "Downloaded by DRAM"
},
"amg_database": {
"name": "AMG database",
"branch": "master",
"Download time": "09/28/2022, 12:01:47",
"Origin": "Downloaded by DRAM"
},
"etc_module_database": {
"name": "ETC module database",
"branch": "master",
"Download time": "09/28/2022, 12:01:47",
"Origin": "Downloaded by DRAM"
}
},
"log_path": null
}
2 changes: 1 addition & 1 deletion mag_annotator/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '1.4.3'
__version__ = '1.4.4'
5 changes: 2 additions & 3 deletions mag_annotator/database_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@
import logging
from shutil import copy2
import warnings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from datetime import datetime
from functools import partial
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import pandas as pd

Expand Down
48 changes: 0 additions & 48 deletions mag_annotator/database_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,54 +26,6 @@
DEFAULT_MMMSPRO_DB_NAME = 'db'


# KOFAM_CITATION = ("Aramaki T., Blanc-Mathieu R., Endo H., Ohkubo K., Kanehisa "
# "M., Goto S., Ogata H.\nKofamKOALA: KEGG ortholog assignment"
# " based on profile HMM and adaptive score threshold.\nBioinf"
# "ormatics. 2019 Nov 19. pii: btz859. doi: 10.1093/bioinforma"
# "tics/btz859."
# ) # arguably not for kofam but the closest I saw
# VIRAL_REFSEQ_CITATION = ("Brister JR, Ako-Adjei D, Bao Y, Blinkova O. NCBI vir"
# "al genomes resource. Nucleic Acids Res. 2015 Jan;43("
# "Database issue):D571-7 PubMed PubMedCentral"
# ) # Three options but this one is viral specific
# KEGG_CITATION = ("Kanehisa, M., Furumichi, M., Sato, Y., Ishiguro-Watanabe, M."
# ", and Tanabe, M.; KEGG: integrating viruses and cellular org"
# "anisms. Nucleic Acids Res. 49, D545-D551 (2021)."
# )
# PFAM_CITATION = ("Pfam: The protein families database in 2021: J. Mistry, S. C"
# "huguransky, L. Williams, M. Qureshi, G.A. Salazar, E.L.L. So"
# "nnhammer, S.C.E. Tosatto, L. Paladin, S. Raj, L.J. Richardso"
# "n, R.D. Finn, A. Bateman"
# )
# PEPTIDASE_CITATION = ("Rawlings, N.D., Barrett, A.J., Thomas, P.D., Huang, X.,"
# " Bateman, A. & Finn, R.D. (2018) The MEROPS database of"
# " proteolytic enzymes, their substrates and inhibitors i"
# "n 2017 and a comparison with peptidases in the PANTHER "
# "database. Nucleic Acids Res 46, D624-D632."
# )
# VOGDB_CITATION = ("Thannesberger, J., Hellinger, H. J., Klymiuk, I., Kastner, M"
# ". T., Rieder, F. J., Schneider, M., ... & Steininger, C. (20"
# "17). Viruses comprise an extensive pool of mobile genetic el"
# "ements in eukaryote cell cultures and human clinical samples"
# ". The FASEB Journal, 31(5), 1987-2000."
# )
# UNIREF_CITATION = ("Wang Y, Wang Q, Huang H, Huang W, Chen Y, McGarvey PB, Wu C"
# "H, Arighi CN, UniProt Consortium. A crowdsourcing open plat"
# "form for literature curation in UniProt Plos Biology. 19(12"
# "):e3001464 (2021)"
# )
# DBCAN_CITATION = ("Yin Y*, Mao X*, Yang JC, Chen X, Mao F and Xu Y, dbCAN: a we"
# "b resource for automated carbohydrate-active enzyme annotati"
# "on, Nucleic Acids Res. 2012 Jul;40(Web Server issue):W445-51"
# ) # again a citation for the tool not the db
# DRAM_CITATION = ("M. Shaffer, M. A. Borton, B. B. McGivern, A. A. Zayed, S. L. "
# "La Rosa, L. M. Solden, P. Liu, A. B. Narrowe, J. Rodríguez-Ra"
# "mos, B. Bolduc et al., \"Dram for distilling microbial metabo"
# "lism to automate the curation of microbiome function,\" Nucle"
# "ic acids research, vol. 48, no. 16, pp. 8883–8900, 2020."
# )


KOFAM_CITATION = ("T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa"
", S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignme"
"nt based on profile hmm and adaptive score threshold,\" Bioin"
Expand Down
20 changes: 0 additions & 20 deletions mag_annotator/database_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,25 +82,6 @@ def serialize(self):
'dbcan_subfam_ec': self.ec,
}


# DBCAN_SUBFAM_EC_TABLE_NAME = 'dbcan_subfam_ec'


# class DbcanSubfamEC(Base):
# __tablename__ = DBCAN_SUBFAM_EC_TABLE_NAME
#
# id = Column(String(30), primary_key=True, nullable=False, index=True)
#
# description = Column(String(1000))
#
# @property
# def serialize(self):
# return {
# 'dbcan_id': self.id,
# 'dbcan_subfam_ec': self.description,
# }


VIRAL_DESCRIPTION_TABLE_NAME = 'viral_description'


Expand Down Expand Up @@ -164,7 +145,6 @@ def create_description_db(db_loc):
UNIREF_DESCRIPTION_TABLE_NAME: UniRefDescription,
PFAM_DESCRIPTION_TABLE_NAME: PfamDescription,
DBCAN_DESCRIPTION_TABLE_NAME: DbcanDescription,
# DBCAN_SUBFAM_EC_TABLE_NAME: DbcanSubfamEC,
VIRAL_DESCRIPTION_TABLE_NAME: ViralDescription,
PEPTIDASE_DESCRIPTION_TABLE_NAME: PeptidaseDescription,
VOGDB_DESCRIPTION_TABLE_NAME: VOGDBDescription}
4 changes: 2 additions & 2 deletions mag_annotator/summarize_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou
for genome, frame in annotations.groupby(groupby_column, sort=False):
# make dict of identifiers to gene names
id_gene_dict = defaultdict(list)
for gene, ids in get_ids_from_annotations_by_row(frame).iteritems():
for gene, ids in get_ids_from_annotations_by_row(frame).items():
for id_ in ids:
id_gene_dict[id_].append(gene)
# fill in genome summary_frame
Expand Down Expand Up @@ -316,7 +316,7 @@ def get_module_step_coverage(kos, module_net):
def make_module_coverage_df(annotation_df, module_nets):
kos_to_genes = defaultdict(list)
ko_id_name = 'kegg_id' if 'kegg_id' in annotation_df.columns else 'ko_id'
for gene_id, ko_list in annotation_df[ko_id_name].iteritems():
for gene_id, ko_list in annotation_df[ko_id_name].items():
if type(ko_list) is str:
for ko in ko_list.split(','):
kos_to_genes[ko].append(gene_id)
Expand Down
Loading

0 comments on commit ac83ba7

Please sign in to comment.