diff --git a/mag_annotator/CONFIG b/mag_annotator/CONFIG index 45f3b20a..0f019614 100644 --- a/mag_annotator/CONFIG +++ b/mag_annotator/CONFIG @@ -1,28 +1,155 @@ { "search_databases": { - "kegg": null, - "kofam_hmm": null, - "kofam_ko_list": null, - "uniref": null, - "pfam": null, - "dbcan": null, - "viral": null, - "peptidase": null, - "vogdb": null + "kegg": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kegg.20221012.mmsdb", + "kofam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_profiles.hmm", + "kofam_ko_list": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_ko_list.tsv", + "uniref": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/uniref90.20220928.mmsdb", + "pfam": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/pfam.mmspro", + "dbcan": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/dbCAN-HMMdb-V11.txt", + "viral": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/refseq_viral.20220928.mmsdb", + "peptidase": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/peptidases.20220928.mmsdb", + "vogdb": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_latest_hmms.txt" }, - "custom_dbs": null, "database_descriptions": { - "pfam_hmm_dat": null, - "dbcan_fam_activities": null, - "vog_annotations": null + "pfam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/Pfam-A.hmm.dat.gz", + "dbcan_fam_activities": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam-activities.txt", + "dbcan_subfam_ec": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam.subfam.ec.txt", + "vog_annotations": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_annotations_latest.tsv.gz" }, "dram_sheets": { - "genome_summary_form": null, - "module_step_form": null, - "etc_module_database": null, - "function_heatmap_form": null, - "amg_database": null + "genome_summary_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/genome_summary_form.20220928.tsv", + "module_step_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/module_step_form.20220928.tsv", + "etc_module_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/etc_mdoule_database.20220928.tsv", + "function_heatmap_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/function_heatmap_form.20220928.tsv", + "amg_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/amg_database.20220928.tsv" }, - "description_db": null, - "dram_version": null -} \ No newline at end of file + "dram_version": "1.4.0rc1", + "description_db": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/description_db.sqlite", + "setup_info": { + "kegg": { + "name": "KEGG db", + "description_db_updated": "10/12/2022, 18:52:36", + "citation": " M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021." + }, + "kofam_hmm": { + "name": "KOfam db", + "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.", + "Download time": "09/28/2022, 11:00:09", + "Origin": "Downloaded by DRAM" + }, + "kofam_ko_list": { + "name": "KOfam KO list", + "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.", + "Download time": "09/28/2022, 11:00:11", + "Origin": "Downloaded by DRAM" + }, + "uniref": { + "name": "UniRef db", + "description_db_updated": "09/29/2022, 13:14:40", + "citation": "Y. Wang, Q. Wang, H. Huang, W. Huang, Y. Chen, P. B. McGarvey, C. H. Wu, C. N. Arighi, and U. Consortium, \"A crowdsourcing open platform for literature curation in uniprot,\" PLoS Biology, vol. 19, no. 12, p. e3001464, 2021.", + "version": "90", + "Download time": "09/28/2022, 11:15:01", + "Origin": "Downloaded by DRAM" + }, + "pfam": { + "name": "Pfam db", + "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.", + "Download time": "09/28/2022, 11:49:29", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:23:47" + }, + "pfam_hmm": { + "name": "Pfam hmm dat", + "description_db_updated": "Unknown, or Never", + "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.", + "Download time": "09/28/2022, 11:49:31", + "Origin": "Downloaded by DRAM" + }, + "dbcan": { + "name": "dbCAN db", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:23:50" + }, + "dbcan_fam_activities": { + "name": "dbCAN family activities", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "upload_date": "08062022", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM" + }, + "dbcan_subfam_ec": { + "name": "dbCAN subfamily EC numbers", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "upload_date": "08062022", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM" + }, + "vogdb": { + "name": "VOGDB db", + "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.", + "version": "latest", + "Download time": "09/28/2022, 11:51:57", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:24:14" + }, + "vog_annotations": { + "name": "VOG annotations", + "description_db_updated": "Unknown, or Never", + "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.", + "version": "latest", + "Download time": "09/28/2022, 11:51:58", + "Origin": "Downloaded by DRAM" + }, + "viral": { + "name": "RefSeq Viral db", + "description_db_updated": "09/29/2022, 13:16:15", + "citation": "J. R. Brister, D. Ako-Adjei, Y. Bao, and O. Blinkova, \"Ncbi viral genomes resource,\" Nucleic acids research, vol. 43, no. D1, pp. D571\u2013D577, 2015. [3] M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tan-abe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021.", + "viral_files": 2, + "Download time": "09/28/2022, 11:52:20", + "Origin": "Downloaded by DRAM" + }, + "peptidase": { + "name": "MEROPS peptidase db", + "description_db_updated": "09/29/2022, 13:23:40", + "citation": "N. D. Rawlings, A. J. Barrett, P. D. Thomas, X. Huang, A. Bateman, and R. D. Finn, \"The merops database of proteolytic enzymes, their substrates and inhibitors in 2017 and a comparison with peptidases in the panther database,\" Nucleic acids research, vol. 46, no. D1, pp. D624\u2013D632, 2018.", + "Download time": "09/28/2022, 12:01:46", + "Origin": "Downloaded by DRAM" + }, + "genome_summary_form": { + "name": "Genome summary form", + "branch": "master", + "Download time": "09/28/2022, 12:01:46", + "Origin": "Downloaded by DRAM" + }, + "module_step_form": { + "name": "Module step form", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "function_heatmap_form": { + "name": "Function heatmap form", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "amg_database": { + "name": "AMG database", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "etc_module_database": { + "name": "ETC module database", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + } + }, + "log_path": null +} diff --git a/mag_annotator/__init__.py b/mag_annotator/__init__.py index 4e7c72a5..9e0feee7 100644 --- a/mag_annotator/__init__.py +++ b/mag_annotator/__init__.py @@ -1 +1 @@ -__version__ = '1.4.3' +__version__ = '1.4.4' diff --git a/mag_annotator/database_handler.py b/mag_annotator/database_handler.py index 78a8bcd7..a94bca9f 100644 --- a/mag_annotator/database_handler.py +++ b/mag_annotator/database_handler.py @@ -5,11 +5,10 @@ import logging from shutil import copy2 import warnings -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - from datetime import datetime from functools import partial +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker import pandas as pd diff --git a/mag_annotator/database_processing.py b/mag_annotator/database_processing.py index 72dff9ea..cee0eefa 100644 --- a/mag_annotator/database_processing.py +++ b/mag_annotator/database_processing.py @@ -26,54 +26,6 @@ DEFAULT_MMMSPRO_DB_NAME = 'db' -# KOFAM_CITATION = ("Aramaki T., Blanc-Mathieu R., Endo H., Ohkubo K., Kanehisa " -# "M., Goto S., Ogata H.\nKofamKOALA: KEGG ortholog assignment" -# " based on profile HMM and adaptive score threshold.\nBioinf" -# "ormatics. 2019 Nov 19. pii: btz859. doi: 10.1093/bioinforma" -# "tics/btz859." -# ) # arguably not for kofam but the closest I saw -# VIRAL_REFSEQ_CITATION = ("Brister JR, Ako-Adjei D, Bao Y, Blinkova O. NCBI vir" -# "al genomes resource. Nucleic Acids Res. 2015 Jan;43(" -# "Database issue):D571-7 PubMed PubMedCentral" -# ) # Three options but this one is viral specific -# KEGG_CITATION = ("Kanehisa, M., Furumichi, M., Sato, Y., Ishiguro-Watanabe, M." -# ", and Tanabe, M.; KEGG: integrating viruses and cellular org" -# "anisms. Nucleic Acids Res. 49, D545-D551 (2021)." -# ) -# PFAM_CITATION = ("Pfam: The protein families database in 2021: J. Mistry, S. C" -# "huguransky, L. Williams, M. Qureshi, G.A. Salazar, E.L.L. So" -# "nnhammer, S.C.E. Tosatto, L. Paladin, S. Raj, L.J. Richardso" -# "n, R.D. Finn, A. Bateman" -# ) -# PEPTIDASE_CITATION = ("Rawlings, N.D., Barrett, A.J., Thomas, P.D., Huang, X.," -# " Bateman, A. & Finn, R.D. (2018) The MEROPS database of" -# " proteolytic enzymes, their substrates and inhibitors i" -# "n 2017 and a comparison with peptidases in the PANTHER " -# "database. Nucleic Acids Res 46, D624-D632." -# ) -# VOGDB_CITATION = ("Thannesberger, J., Hellinger, H. J., Klymiuk, I., Kastner, M" -# ". T., Rieder, F. J., Schneider, M., ... & Steininger, C. (20" -# "17). Viruses comprise an extensive pool of mobile genetic el" -# "ements in eukaryote cell cultures and human clinical samples" -# ". The FASEB Journal, 31(5), 1987-2000." -# ) -# UNIREF_CITATION = ("Wang Y, Wang Q, Huang H, Huang W, Chen Y, McGarvey PB, Wu C" -# "H, Arighi CN, UniProt Consortium. A crowdsourcing open plat" -# "form for literature curation in UniProt Plos Biology. 19(12" -# "):e3001464 (2021)" -# ) -# DBCAN_CITATION = ("Yin Y*, Mao X*, Yang JC, Chen X, Mao F and Xu Y, dbCAN: a we" -# "b resource for automated carbohydrate-active enzyme annotati" -# "on, Nucleic Acids Res. 2012 Jul;40(Web Server issue):W445-51" -# ) # again a citation for the tool not the db -# DRAM_CITATION = ("M. Shaffer, M. A. Borton, B. B. McGivern, A. A. Zayed, S. L. " -# "La Rosa, L. M. Solden, P. Liu, A. B. Narrowe, J. Rodríguez-Ra" -# "mos, B. Bolduc et al., \"Dram for distilling microbial metabo" -# "lism to automate the curation of microbiome function,\" Nucle" -# "ic acids research, vol. 48, no. 16, pp. 8883–8900, 2020." -# ) - - KOFAM_CITATION = ("T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa" ", S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignme" "nt based on profile hmm and adaptive score threshold,\" Bioin" diff --git a/mag_annotator/database_setup.py b/mag_annotator/database_setup.py index c0c2216d..dbe686c6 100644 --- a/mag_annotator/database_setup.py +++ b/mag_annotator/database_setup.py @@ -82,25 +82,6 @@ def serialize(self): 'dbcan_subfam_ec': self.ec, } - -# DBCAN_SUBFAM_EC_TABLE_NAME = 'dbcan_subfam_ec' - - -# class DbcanSubfamEC(Base): -# __tablename__ = DBCAN_SUBFAM_EC_TABLE_NAME -# -# id = Column(String(30), primary_key=True, nullable=False, index=True) -# -# description = Column(String(1000)) -# -# @property -# def serialize(self): -# return { -# 'dbcan_id': self.id, -# 'dbcan_subfam_ec': self.description, -# } - - VIRAL_DESCRIPTION_TABLE_NAME = 'viral_description' @@ -164,7 +145,6 @@ def create_description_db(db_loc): UNIREF_DESCRIPTION_TABLE_NAME: UniRefDescription, PFAM_DESCRIPTION_TABLE_NAME: PfamDescription, DBCAN_DESCRIPTION_TABLE_NAME: DbcanDescription, - # DBCAN_SUBFAM_EC_TABLE_NAME: DbcanSubfamEC, VIRAL_DESCRIPTION_TABLE_NAME: ViralDescription, PEPTIDASE_DESCRIPTION_TABLE_NAME: PeptidaseDescription, VOGDB_DESCRIPTION_TABLE_NAME: VOGDBDescription} diff --git a/mag_annotator/summarize_genomes.py b/mag_annotator/summarize_genomes.py index 3c2d22a9..c58cb130 100644 --- a/mag_annotator/summarize_genomes.py +++ b/mag_annotator/summarize_genomes.py @@ -97,7 +97,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou for genome, frame in annotations.groupby(groupby_column, sort=False): # make dict of identifiers to gene names id_gene_dict = defaultdict(list) - for gene, ids in get_ids_from_annotations_by_row(frame).iteritems(): + for gene, ids in get_ids_from_annotations_by_row(frame).items(): for id_ in ids: id_gene_dict[id_].append(gene) # fill in genome summary_frame @@ -316,7 +316,7 @@ def get_module_step_coverage(kos, module_net): def make_module_coverage_df(annotation_df, module_nets): kos_to_genes = defaultdict(list) ko_id_name = 'kegg_id' if 'kegg_id' in annotation_df.columns else 'ko_id' - for gene_id, ko_list in annotation_df[ko_id_name].iteritems(): + for gene_id, ko_list in annotation_df[ko_id_name].items(): if type(ko_list) is str: for ko in ko_list.split(','): kos_to_genes[ko].append(gene_id) diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py index 61df3d88..b0ca6a3b 100644 --- a/mag_annotator/summarize_vgfs.py +++ b/mag_annotator/summarize_vgfs.py @@ -104,33 +104,49 @@ def make_viral_stats_table(annotations, potential_amgs, groupby_column='scaffold return pd.DataFrame(viral_stats_series).fillna(0) -def make_viral_distillate(potential_amgs, genome_summary_form): - rows = list() - potential_amgs['ids'] = get_ids_from_annotations_by_row(potential_amgs) - potential_amgs.iloc[0] - logger = logging.getLogger() - check_columns(potential_amgs, logger) - missing_count = 0 - for gene, row in potential_amgs.iterrows(): - gene_ids = row.ids & set(genome_summary_form.index) - if len(gene_ids) > 0: - for gene_id in gene_ids: - gene_summary = genome_summary_form.loc[gene_id] - if type(gene_summary) is pd.Series: - rows.append([gene, row['scaffold'], gene_id, gene_summary['gene_description'], - gene_summary['sheet'], gene_summary['header'], gene_summary['subheader'], - gene_summary['module'], row['auxiliary_score'], row['amg_flags']]) - else: - for sub_gene_id, sub_gene_summary in gene_summary.iterrows(): - rows.append([gene, row['scaffold'], gene_id, sub_gene_summary['gene_description'], - sub_gene_summary['sheet'], sub_gene_summary['header'], - sub_gene_summary['subheader'], sub_gene_summary['module'], - row['auxiliary_score'], row['amg_flags']]) - else: - warnings.warn("No distillate information found for gene %s" % gene) - rows.append([gene, row['scaffold'], '', '', '', '', '', '', row['auxiliary_score'], - row['amg_flags']]) - return pd.DataFrame(rows, columns=VIRAL_DISTILLATE_COLUMNS) +def make_viral_distillate(potential_amgs, genome_summary_form, amg_database, logger): + """Make a summary of what in our database makes something a AMG or likly AMG to dram""" + # Transform the amg database to make it more workable + def look_up_metabolic_info(search_db, match_db, match_db_name): + id_genes = set(match_db.index) + return ( + (search_db + .assign(gene_id = lambda x: x['ids'].apply(lambda y: y & id_genes)) + )[['gene_id', 'scaffold', 'auxiliary_score', 'amg_flags']] + .explode('gene_id') + .dropna(subset=['gene_id']) + .merge(match_db, how='left', left_on='gene_id', right_index=True) + .assign(gene_id_origin=match_db_name)) + + amg_database_frame = (amg_database + .melt(value_vars=['KO', 'EC', 'PFAM'], + id_vars=['gene', 'module', 'metabolism', + 'reference', 'verified'], + value_name='gene_id') + .drop('variable', axis=1) + .assign( + gene_id=lambda x: x['gene_id'].apply( + lambda y: [i.strip() for i in str(y).split(';')])) + .explode('gene_id') + .dropna(subset='gene_id') + .set_index('gene_id') + .rename(columns = {'gene': 'gene_description'}) + ) + potential_amgs = potential_amgs.assign(ids=get_ids_from_annotations_by_row(potential_amgs)) + metabolic_df = look_up_metabolic_info(potential_amgs, genome_summary_form, 'genome_summary_form') + amg_df = look_up_metabolic_info(potential_amgs, amg_database_frame, 'amg_database') + missing = list(set(potential_amgs.index) - (set(metabolic_df.index) | (set(amg_df.index)) )) + # evaluate what is mising + logger.warning(f"No distillate information found for {len(missing)} genes.") + logger.debug('\n'.join(missing)) + + + summary = pd.concat([ + metabolic_df, + amg_df, + potential_amgs.loc[missing, ['scaffold', 'auxiliary_score', 'amg_flags']]]) + summary.reset_index(inplace=True, drop=False, names='gene') + return summary def make_vgf_order(amgs): @@ -251,7 +267,11 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili viral_genome_stats.to_csv(path.join(output_dir, 'vMAG_stats.tsv'), sep='\t') logger.info('Calculated viral genome statistics') - viral_distillate = make_viral_distillate(potential_amgs, genome_summary_form) + viral_distillate = make_viral_distillate( + potential_amgs, + genome_summary_form, + pd.read_csv(database_handler.config["dram_sheets"].get('amg_database'), sep='\t'), + logger) viral_distillate.to_csv(path.join(output_dir, 'amg_summary.tsv'), sep='\t', index=None) logger.info('Generated AMG summary') diff --git a/scripts/DRAM-v.py b/scripts/DRAM-v.py index 38abc0e9..93b110f3 100644 --- a/scripts/DRAM-v.py +++ b/scripts/DRAM-v.py @@ -27,7 +27,7 @@ annotate_parser.add_argument('-i', '--input_fasta', help="fasta file, output from ", required=True) annotate_parser.add_argument('-v', '--virsorter_affi_contigs', help="VirSorter VIRSorter_affi-contigs.tab " "output file") - annotate_parser.add_argument('-o', '--output_dir', help="output directory") + annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True) annotate_parser.add_argument('--min_contig_size', type=int, default=2500, help='minimum contig size to be used for gene prediction') annotate_parser.add_argument('--split_contigs', action='store_true', default=False, @@ -78,7 +78,7 @@ # parser for summarizing genomes distill_parser.add_argument("-i", "--input_file", help="Annotations path") - distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes") + distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes", required=True) distill_parser.add_argument("--groupby_column", help="Column from annotations to group as VGF units", default='scaffold') distill_parser.add_argument("--max_auxiliary_score", type=int, default=3, @@ -134,7 +134,7 @@ # parser for getting gene neighborhoods neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path") - neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods") + neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods", required=True) neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around") neighborhood_parser.add_argument("--identifiers", nargs='*', help="Database identifiers assigned by DRAM to find neighborhoods around") diff --git a/scripts/DRAM.py b/scripts/DRAM.py index 2430a68d..50228449 100644 --- a/scripts/DRAM.py +++ b/scripts/DRAM.py @@ -32,7 +32,7 @@ annotate_parser.add_argument('-i', '--input_fasta', action='append', help="fasta file, optionally with wildcards to point to multiple fastas", required=True) - annotate_parser.add_argument('-o', '--output_dir', help="output directory") + annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True) annotate_parser.add_argument('--min_contig_size', type=int, default=2500, help='minimum contig size to be used for gene prediction') annotate_parser.add_argument('--config_loc', default=None, @@ -130,7 +130,7 @@ # parser for summarizing genomes distill_parser.add_argument("-i", "--input_file", help="Annotations path") - distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes") + distill_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write summarized genomes") distill_parser.add_argument('--log_file_path', help="A name and loctation for the log file") distill_parser.add_argument("--rrna_path", help="rRNA output from annotation") @@ -179,7 +179,7 @@ # parser for getting gene neighborhoods neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path") - neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods") + neighborhood_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write gene neighborhoods") neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around") neighborhood_parser.add_argument("--identifiers", nargs='*', help="Database identifiers assigned by DRAM to find neighborhoods around") @@ -199,7 +199,7 @@ # TODO: Make it work with append so you can use multiple -i's merge_annotations_parser.add_argument("-i", "--input_dirs", help="Path with wildcards pointing to DRAM annotation " "output directories") - merge_annotations_parser.add_argument("-o", "--output_dir", help="Path to output merged annotations files") + merge_annotations_parser.add_argument("-o", "--output_dir", required=True, help="Path to output merged annotations files") merge_annotations_parser.set_defaults(func=merge_annotations_cmd) args = parser.parse_args() diff --git a/tests/test_summarize_vgfs.py b/tests/test_summarize_vgfs.py index b9e5681f..01639a33 100644 --- a/tests/test_summarize_vgfs.py +++ b/tests/test_summarize_vgfs.py @@ -1,28 +1,49 @@ import pytest +import logging +from mag_annotator.utils import setup_logger import pandas as pd import altair as alt -from mag_annotator.summarize_vgfs import filter_to_amgs, get_strand_switches, make_viral_distillate, make_vgf_order, \ - make_amg_count_column, make_viral_functional_df, make_viral_functional_heatmap +from mag_annotator.summarize_vgfs import ( + filter_to_amgs, + get_strand_switches, + make_viral_distillate, + make_vgf_order, + make_amg_count_column, + make_viral_functional_df, + make_viral_functional_heatmap, +) @pytest.fixture() def annotations(): - return pd.DataFrame([['scaffold_1', 'K99999', 'VTF', 1], - ['scaffold_1', 'K12345', 'M', 4], - ['scaffold_1', 'K00001', 'MFTJ', 3], - ['scaffold_1', 'K11111', 'MKE', 2]], - index=['gene_1', 'gene_2', 'gene_3', 'gene_4'], - columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score']) + return pd.DataFrame( + [ + ["scaffold_1", "K99999", "VTF", 1], + ["scaffold_1", "K12345", "M", 4], + ["scaffold_1", "K00001", "MFTJ", 3], + ["scaffold_1", "K11111", "MKE", 2], + ], + index=["gene_1", "gene_2", "gene_3", "gene_4"], + columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"], + ) @pytest.fixture() def pamgs(): - return pd.DataFrame([['scaffold_1', 'K00001', 'MFTJ', 3], - ['scaffold_1', 'K11111', 'MKE', 2]], - index=['gene_3', 'gene_4'], - columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score']) + return pd.DataFrame( + [["scaffold_1", "K00001", "MFTJ", 3], ["scaffold_1", "K11111", "MKE", 2]], + index=["gene_3", "gene_4"], + columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"], + ) + + +@pytest.fixture() +def logger(tmpdir): + logger = logging.getLogger("test_log") + setup_logger(logger) + return logger def test_filter_to_amgs(annotations, pamgs): @@ -31,35 +52,129 @@ def test_filter_to_amgs(annotations, pamgs): def test_get_strand_switches(): - assert get_strand_switches(['+', '-', '+']) == 2 - assert get_strand_switches(['+', '+', '+']) == 0 - assert get_strand_switches(['-']) == 0 - assert get_strand_switches(['+', '+', '-', '-']) == 1 + assert get_strand_switches(["+", "-", "+"]) == 2 + assert get_strand_switches(["+", "+", "+"]) == 0 + assert get_strand_switches(["-"]) == 0 + assert get_strand_switches(["+", "+", "-", "-"]) == 1 @pytest.fixture() def genome_summary_frame(): - return pd.DataFrame(pd.DataFrame([['description', 'module1', 'main', 'header1', 'subheader1'], - ['description2', 'module2', 'main', 'header1', 'subheader1'], - ['description3', 'module3', 'other', 'header1', 'subheader1']], - index=['K00001', 'K12345', 'K00001'], - columns=['gene_description', 'module', 'sheet', 'header', 'subheader'])) - - -def test_make_viral_distillate(pamgs, genome_summary_frame): - test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame) - viral_distillate = pd.DataFrame([['gene_3', 'scaffold_1', 'K00001', 'description', 'main', 'header1', 'subheader1', - 'module1', 3, 'MFTJ'], - ['gene_3', 'scaffold_1', 'K00001', 'description3', 'other', 'header1', - 'subheader1', 'module3', 3, 'MFTJ'], - ['gene_4', 'scaffold_1', '', '', '', '', '', '', 2, 'MKE']], - columns=['gene', 'scaffold', 'gene_id', 'gene_description', 'category', 'header', - 'subheader', 'module', 'auxiliary_score', 'amg_flags']) - pd.testing.assert_frame_equal(test_viral_distillate, viral_distillate) + return pd.DataFrame( + pd.DataFrame( + [ + ["description", "module1", "main", "header1", "subheader1"], + ["description2", "module2", "main", "header1", "subheader1"], + ["description3", "module3", "other", "header1", "subheader1"], + ], + index=["K00001", "K12345", "K00001"], + columns=["gene_description", "module", "sheet", "header", "subheader"], + ) + ) + + +def test_make_viral_distillate(pamgs, genome_summary_frame, logger): + amg_database = pd.DataFrame( + [ + [ + "K11111", + "gene_4", + "gene_4", + "d4", + "module3", + "metabolism3", + "reference2", + "true", + ] + ], + columns=[ + "KO", + "EC", + "PFAM", + "gene", + "module", + "metabolism", + "reference", + "verified", + ], + ) + output_viral_distillate = make_viral_distillate( + pamgs, genome_summary_frame, amg_database=amg_database, logger=logger + ) + expected_viral_distillate = pd.DataFrame( + [ + [ + "gene_3", + "scaffold_1", + "K00001", + "description", + "header1", + "subheader1", + "module1", + 3, + "MFTJ", + "main", + "genome_summary_form", + "", + "", + ], + [ + "gene_3", + "scaffold_1", + "K00001", + "description3", + "header1", + "subheader1", + "module3", + 3, + "MFTJ", + "other", + "genome_summary_form", + "", + "", + "", + ], + [ + "gene_4", + "scaffold_1", + "K11111", + "d4", + "", + "", + "module3", + 2, + "MKE", + "", + "amg_database", + "metabolism3", + "reference2", + "true", + ], + ], + columns=[ + "gene", + "scaffold", + "gene_id", + "gene_description", + "header", + "subheader", + "module", + "auxiliary_score", + "amg_flags", + "sheet", + "gene_id_origin", + "metabolism", + "reference", + "verified", + ], + ) + output_viral_distillate = output_viral_distillate.sort_index(axis=1).fillna("") + expected_viral_distillate = expected_viral_distillate.sort_index(axis=1).fillna("") + pd.testing.assert_frame_equal(output_viral_distillate, expected_viral_distillate) def test_make_vgf_order(pamgs): - assert make_vgf_order(pamgs) == ['scaffold_1'] + assert make_vgf_order(pamgs) == ["scaffold_1"] def test_make_amg_count_column(pamgs): @@ -68,14 +183,26 @@ def test_make_amg_count_column(pamgs): @pytest.fixture() def viral_functional_df(): - return pd.DataFrame([['main', 'module1', 'gene_3', 'K00001', 'scaffold_1', True], - ['other', 'module3', 'gene_3', 'K00001', 'scaffold_1', True]], - columns=['Category', 'Function', 'AMG Genes', 'Genes Present', 'Contig Name', - 'Present in Contig']) + return pd.DataFrame( + [ + ["main", "module1", "gene_3", "K00001", "scaffold_1", True], + ["other", "module3", "gene_3", "K00001", "scaffold_1", True], + ], + columns=[ + "Category", + "Function", + "AMG Genes", + "Genes Present", + "Contig Name", + "Present in Contig", + ], + ) def test_make_viral_functional_df(pamgs, genome_summary_frame, viral_functional_df): - test_viral_function_df = make_viral_functional_df(pamgs, genome_summary_frame, 'scaffold') + test_viral_function_df = make_viral_functional_df( + pamgs, genome_summary_frame, "scaffold" + ) pd.testing.assert_frame_equal(test_viral_function_df, viral_functional_df)