From 052e52456581f99b28e8044b0ea1e4a7a26525eb Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Wed, 12 Oct 2022 13:50:33 -0600 Subject: [PATCH 01/10] Fix up how viral distillate works --- mag_annotator/pull_sequences.py | 1 - mag_annotator/summarize_vgfs.py | 98 +++++++++++---------------------- 2 files changed, 33 insertions(+), 66 deletions(-) diff --git a/mag_annotator/pull_sequences.py b/mag_annotator/pull_sequences.py index d0046dca..4a68e842 100644 --- a/mag_annotator/pull_sequences.py +++ b/mag_annotator/pull_sequences.py @@ -84,7 +84,6 @@ def pull_sequences(input_tsv, input_fasta, output_fasta, fastas=None, scaffolds= if adjective_sheet is not None: adjective_sheet = pd.read_csv(input_tsv, sep='\t', index_col=0) annotations = annotations.loc[adjective_sheet.index] - breakpoint() annotation_genes_to_keep = get_genes_from_identifiers(annotations, genes, fastas, scaffolds, identifiers, categories, custom_distillate) annotations = annotations.loc[annotation_genes_to_keep] diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py index bd980724..ff295b0a 100644 --- a/mag_annotator/summarize_vgfs.py +++ b/mag_annotator/summarize_vgfs.py @@ -104,78 +104,48 @@ def make_viral_stats_table(annotations, potential_amgs, groupby_column='scaffold return pd.DataFrame(viral_stats_series).fillna(0) -def make_viral_distillate(potential_amgs, genome_summary_form): - rows = list() - potential_amgs['ids'] = get_ids_from_annotations_by_row(potential_amgs) - potential_amgs.iloc[0] - logger = logging.getLogger() - check_columns(potential_amgs, logger) - # breakpoint() - for gene, row in potential_amgs.iterrows(): - gene_ids = row.ids & set(genome_summary_form.index) - if len(gene_ids) > 0: - for gene_id in gene_ids: - gene_summary = genome_summary_form.loc[gene_id] - # breakpoint() - if type(gene_summary) is pd.Series: - rows.append([gene, row['scaffold'], gene_id, gene_summary['gene_description'], - gene_summary['sheet'], gene_summary['header'], gene_summary['subheader'], - gene_summary['module'], row['auxiliary_score'], row['amg_flags']]) - else: - for sub_gene_id, sub_gene_summary in gene_summary.iterrows(): - rows.append([gene, row['scaffold'], gene_id, sub_gene_summary['gene_description'], - sub_gene_summary['sheet'], sub_gene_summary['header'], - sub_gene_summary['subheader'], sub_gene_summary['module'], - row['auxiliary_score'], row['amg_flags']]) - else: - warnings.warn("No distillate information found for gene %s" % gene) - rows.append([gene, row['scaffold'], '', '', '', '', '', '', row['auxiliary_score'], - row['amg_flags']]) - return pd.DataFrame(rows, columns=VIRAL_DISTILLATE_COLUMNS) - - -def make_viral_distillate2(potential_amgs, genome_summary_form, amg_database, logger): +def make_viral_distillate(potential_amgs, genome_summary_form, amg_database, logger): """Make a summary of what in our database makes somthing a AMG or likly AMG to dram""" # Transform the amg database to make it more workable + def look_up_metabolic_info(search_db, match_db, match_db_name): + id_genes = set(match_db.index) + return ( + (search_db + .assign(gene_id = lambda x: x['ids'].apply(lambda y: y & id_genes)) + )[['gene_id', 'scaffold', 'auxiliary_score', 'amg_flags']] + .explode('gene_id') + .dropna(subset=['gene_id']) + .merge(match_db, how='left', left_on='gene_id', right_index=True) + .assign(gene_id_origin=match_db_name)) + amg_database_frame = (amg_database .melt(value_vars=['KO', 'EC', 'PFAM'], id_vars=['gene', 'module', 'metabolism', 'reference', 'verified'], value_name='gene_id') .drop('variable', axis=1) + .assign( + gene_id=lambda x: x['gene_id'].apply( + lambda y: [i.strip() for i in str(y).split(';')])) + .explode('gene_id') + .dropna(subset='gene_id') .set_index('gene_id') .rename(columns = {'gene': 'gene_description'}) ) - metabolic_genes = set(genome_summary_form.index) - amg_db_genes = set(amg_database_frame.index) - # verified_amgs = get_amg_ids(amg_database_frame.loc[amg_database_frame.verified]) - potential_amgs['ids'] = get_ids_from_annotations_by_row(potential_amgs) - potential_amgs['amg_ids'] = potential_amgs['ids'].apply(lambda x: x & amg_db_genes) - potential_amgs['metabo_ids'] = potential_amgs['ids'].apply(lambda x: x & metabolic_genes) + potential_amgs = potential_amgs.assign(ids=get_ids_from_annotations_by_row(potential_amgs)) + metabolic_df = look_up_metabolic_info(potential_amgs, genome_summary_form, 'genome_summary_form') + amg_df = look_up_metabolic_info(potential_amgs, amg_database_frame, 'amg_database') + missing = list(set(potential_amgs.index) - (set(metabolic_df.index) | (set(amg_df.index)) )) # evaluate what is mising + logger.warning(f"No distillate information found for {len(missing)} genes.") + logger.debug('\n'.join(missing)) + - missing_info = potential_amgs[((potential_amgs['amg_ids'].apply(len) < 1) & - (potential_amgs['metabo_ids'].apply(len) < 1))] - logger.warning(f"No distillate information found for {len(missing_info.index.unique())} genes.") - logger.debug('\n'.join(missing_info.index.unique())) - missing_info['amg_flags'] - def look_up_metabolic_info(search_db, ids_col, match_db, match_db_name): - return ( - search_db[[ids_col, 'scaffold', 'auxiliary_score', 'amg_flags']] - .rename(columns={ids_col: 'gene_id'}) - .explode('gene_id') - .dropna(subset=['gene_id']) - .merge(match_db, - how='left', - left_on='gene_id', - right_index=True) - .assign(gene_id_origin=match_db_name)) summary = pd.concat([ - look_up_metabolic_info(potential_amgs, 'metabo_ids', genome_summary_form, 'genome_summary_form'), - look_up_metabolic_info(potential_amgs, 'amg_ids', amg_database_frame, 'amg_database'), - missing_info[['scaffold', 'auxiliary_score', 'amg_flags']]]) + metabolic_df, + amg_df, + potential_amgs.loc[missing, ['scaffold', 'auxiliary_score', 'amg_flags']]]) summary.reset_index(inplace=True, drop=False, names='gene') - return summary @@ -203,7 +173,7 @@ def make_viral_functional_df(annotations, genome_summary_form, groupby_column='s # build dict of ids per genome vgf_to_id_dict = defaultdict(defaultdict_list) for vgf, frame in annotations.groupby(groupby_column, sort=False): - for gene, id_list in get_ids_from_annotations_by_row(frame).iteritems(): + for gene, id_list in get_ids_from_annotations_by_row(frame).items(): for id_ in id_list: vgf_to_id_dict[vgf][id_].append(gene) # build long from data frame @@ -292,7 +262,6 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili check_columns(potential_amgs, logger) annotations.kegg_hit annotations.iloc[0] - # breakpoint() logger.info('Determined potential amgs') # make distillate @@ -300,12 +269,11 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili viral_genome_stats.to_csv(path.join(output_dir, 'vMAG_stats.tsv'), sep='\t') logger.info('Calculated viral genome statistics') - # viral_distillate = make_viral_distillate2( - # potential_amgs, - # genome_summary_form, - # pd.read_csv(database_handler.config["dram_sheets"].get('amg_database'), sep='\t'), - # logger) - viral_distillate = make_viral_distillate(potential_amgs, genome_summary_form) + viral_distillate = make_viral_distillate( + potential_amgs, + genome_summary_form, + pd.read_csv(database_handler.config["dram_sheets"].get('amg_database'), sep='\t'), + logger) viral_distillate.to_csv(path.join(output_dir, 'amg_summary.tsv'), sep='\t', index=None) logger.info('Generated AMG summary') From 522bc4b6cc4f89cdaf88e0c06097f6b865883c79 Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Fri, 14 Oct 2022 13:29:35 -0600 Subject: [PATCH 02/10] lets see why vogdb is not running --- mag_annotator/CONFIG | 167 +++++++++++++++++++++++++++++---- mag_annotator/annotate_vgfs.py | 27 +++++- 2 files changed, 169 insertions(+), 25 deletions(-) diff --git a/mag_annotator/CONFIG b/mag_annotator/CONFIG index bc8aa5c7..0f019614 100644 --- a/mag_annotator/CONFIG +++ b/mag_annotator/CONFIG @@ -1,28 +1,155 @@ { "search_databases": { - "kegg": null, - "kofam_hmm": null, - "kofam_ko_list": null, - "uniref": null, - "pfam": null, - "dbcan": null, - "viral": null, - "peptidase": null, - "vogdb": null + "kegg": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kegg.20221012.mmsdb", + "kofam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_profiles.hmm", + "kofam_ko_list": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_ko_list.tsv", + "uniref": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/uniref90.20220928.mmsdb", + "pfam": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/pfam.mmspro", + "dbcan": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/dbCAN-HMMdb-V11.txt", + "viral": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/refseq_viral.20220928.mmsdb", + "peptidase": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/peptidases.20220928.mmsdb", + "vogdb": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_latest_hmms.txt" }, - "custom_dbs": null, "database_descriptions": { - "pfam_hmm_dat": null, - "dbcan_fam_activities": null, - "vog_annotations": null + "pfam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/Pfam-A.hmm.dat.gz", + "dbcan_fam_activities": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam-activities.txt", + "dbcan_subfam_ec": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam.subfam.ec.txt", + "vog_annotations": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_annotations_latest.tsv.gz" }, "dram_sheets": { - "genome_summary_form": null, - "module_step_form": null, - "etc_module_database": null, - "function_heatmap_form": null, - "amg_database": null + "genome_summary_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/genome_summary_form.20220928.tsv", + "module_step_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/module_step_form.20220928.tsv", + "etc_module_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/etc_mdoule_database.20220928.tsv", + "function_heatmap_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/function_heatmap_form.20220928.tsv", + "amg_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/amg_database.20220928.tsv" }, - "description_db": null, - "dram_version": null + "dram_version": "1.4.0rc1", + "description_db": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/description_db.sqlite", + "setup_info": { + "kegg": { + "name": "KEGG db", + "description_db_updated": "10/12/2022, 18:52:36", + "citation": " M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021." + }, + "kofam_hmm": { + "name": "KOfam db", + "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.", + "Download time": "09/28/2022, 11:00:09", + "Origin": "Downloaded by DRAM" + }, + "kofam_ko_list": { + "name": "KOfam KO list", + "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.", + "Download time": "09/28/2022, 11:00:11", + "Origin": "Downloaded by DRAM" + }, + "uniref": { + "name": "UniRef db", + "description_db_updated": "09/29/2022, 13:14:40", + "citation": "Y. Wang, Q. Wang, H. Huang, W. Huang, Y. Chen, P. B. McGarvey, C. H. Wu, C. N. Arighi, and U. Consortium, \"A crowdsourcing open platform for literature curation in uniprot,\" PLoS Biology, vol. 19, no. 12, p. e3001464, 2021.", + "version": "90", + "Download time": "09/28/2022, 11:15:01", + "Origin": "Downloaded by DRAM" + }, + "pfam": { + "name": "Pfam db", + "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.", + "Download time": "09/28/2022, 11:49:29", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:23:47" + }, + "pfam_hmm": { + "name": "Pfam hmm dat", + "description_db_updated": "Unknown, or Never", + "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.", + "Download time": "09/28/2022, 11:49:31", + "Origin": "Downloaded by DRAM" + }, + "dbcan": { + "name": "dbCAN db", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:23:50" + }, + "dbcan_fam_activities": { + "name": "dbCAN family activities", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "upload_date": "08062022", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM" + }, + "dbcan_subfam_ec": { + "name": "dbCAN subfamily EC numbers", + "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.", + "version": "11", + "upload_date": "08062022", + "Download time": "09/28/2022, 11:49:33", + "Origin": "Downloaded by DRAM" + }, + "vogdb": { + "name": "VOGDB db", + "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.", + "version": "latest", + "Download time": "09/28/2022, 11:51:57", + "Origin": "Downloaded by DRAM", + "description_db_updated": "09/29/2022, 13:24:14" + }, + "vog_annotations": { + "name": "VOG annotations", + "description_db_updated": "Unknown, or Never", + "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.", + "version": "latest", + "Download time": "09/28/2022, 11:51:58", + "Origin": "Downloaded by DRAM" + }, + "viral": { + "name": "RefSeq Viral db", + "description_db_updated": "09/29/2022, 13:16:15", + "citation": "J. R. Brister, D. Ako-Adjei, Y. Bao, and O. Blinkova, \"Ncbi viral genomes resource,\" Nucleic acids research, vol. 43, no. D1, pp. D571\u2013D577, 2015. [3] M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tan-abe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021.", + "viral_files": 2, + "Download time": "09/28/2022, 11:52:20", + "Origin": "Downloaded by DRAM" + }, + "peptidase": { + "name": "MEROPS peptidase db", + "description_db_updated": "09/29/2022, 13:23:40", + "citation": "N. D. Rawlings, A. J. Barrett, P. D. Thomas, X. Huang, A. Bateman, and R. D. Finn, \"The merops database of proteolytic enzymes, their substrates and inhibitors in 2017 and a comparison with peptidases in the panther database,\" Nucleic acids research, vol. 46, no. D1, pp. D624\u2013D632, 2018.", + "Download time": "09/28/2022, 12:01:46", + "Origin": "Downloaded by DRAM" + }, + "genome_summary_form": { + "name": "Genome summary form", + "branch": "master", + "Download time": "09/28/2022, 12:01:46", + "Origin": "Downloaded by DRAM" + }, + "module_step_form": { + "name": "Module step form", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "function_heatmap_form": { + "name": "Function heatmap form", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "amg_database": { + "name": "AMG database", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + }, + "etc_module_database": { + "name": "ETC module database", + "branch": "master", + "Download time": "09/28/2022, 12:01:47", + "Origin": "Downloaded by DRAM" + } + }, + "log_path": null } diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py index 642ca929..eaf0ffb3 100644 --- a/mag_annotator/annotate_vgfs.py +++ b/mag_annotator/annotate_vgfs.py @@ -446,7 +446,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # get database locations db_handler = DatabaseHandler(logger) - db_handler.filter_db_locs(low_mem_mode, use_uniref, True, VMAG_DBS_TO_ANNOTATE) + db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE) if virsorter_affi_contigs is not None: virsorter_hits = get_virsorter_hits(virsorter_affi_contigs) @@ -482,10 +482,27 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # annotate vMAGs rename_bins = False - annotations = annotate_fastas(contig_locs, output_dir, db_handler, logger, min_contig_size, prodigal_mode, trans_table, - bit_score_threshold, rbh_bit_score_threshold, custom_db_name, custom_fasta_loc, - custom_hmm_name, custom_hmm_loc, custom_hmm_cutoffs_loc, kofam_use_dbcan2_thresholds, - skip_trnascan, rename_bins, keep_tmp_dir, threads, verbose) + annotations = annotate_fastas( + fasta_locs=contig_locs, + output_dir=output_dir, + db_handler=db_handler, + logger=logger, + min_contig_size=min_contig_size, + prodigal_mode=prodigal_mode, + trans_table=trans_table, + bit_score_threshold=bit_score_threshold, + rbh_bit_score_threshold=rbh_bit_score_threshold, + custom_db_name=custom_db_name, + custom_fasta_loc=custom_fasta_loc, + custom_hmm_name=custom_hmm_name, + custom_hmm_loc=custom_hmm_loc, + custom_hmm_cutoffs_loc=custom_hmm_cutoffs_loc, + kofam_use_dbcan2_thresholds=kofam_use_dbcan2_thresholds, + skip_trnascan=skip_trnascan, + rename_bins=rename_bins, + keep_tmp_dir=keep_tmp_dir, + threads=threads, + verbose=verbose) logging.info('Annotations complete, assigning auxiliary scores and flags') annotations = add_dramv_scores_and_flags(annotations, db_handler, logger, virsorter_hits, input_fasta) From e2a8804f9364c6cc7af21ccbc8c2de988d09f4af Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Fri, 14 Oct 2022 13:42:18 -0600 Subject: [PATCH 03/10] Add breaks --- mag_annotator/annotate_bins.py | 2 ++ mag_annotator/annotate_vgfs.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py index c6e0059f..fce15166 100644 --- a/mag_annotator/annotate_bins.py +++ b/mag_annotator/annotate_bins.py @@ -891,6 +891,7 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size= tmp_dir = path.join(output_dir, 'working_dir') mkdir(tmp_dir) + breakpoint() # setup custom databases to be searched custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'), logger, threads, verbose) @@ -906,6 +907,7 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size= logger.info('Annotating %s' % fasta_name) fasta_dir = path.join(tmp_dir, fasta_name) mkdir(fasta_dir) + breakpoint() annotations_list.append( annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode, trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs, diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py index eaf0ffb3..e87c257c 100644 --- a/mag_annotator/annotate_vgfs.py +++ b/mag_annotator/annotate_vgfs.py @@ -434,6 +434,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ log_file_path = path.join(output_dir, "Annotation.log") logger = logging.getLogger('annotation_log') setup_logger(logger, log_file_path) + breakpoint() # check inputs prodigal_modes = ['train', 'meta', 'single'] @@ -446,8 +447,9 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # get database locations db_handler = DatabaseHandler(logger) + breakpoint() db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE) - + breakpoint() if virsorter_affi_contigs is not None: virsorter_hits = get_virsorter_hits(virsorter_affi_contigs) else: @@ -482,6 +484,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # annotate vMAGs rename_bins = False + breakpoint() annotations = annotate_fastas( fasta_locs=contig_locs, output_dir=output_dir, From ca5dd0197a343af03ca6b0fe76ec1d31d6aa4500 Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Tue, 18 Oct 2022 14:49:27 -0600 Subject: [PATCH 04/10] Fix some typo, and name change errors in DRAM-v --- mag_annotator/annotate_bins.py | 59 +++++++-------------------------- mag_annotator/annotate_vgfs.py | 16 ++++----- mag_annotator/summarize_vgfs.py | 5 +-- 3 files changed, 19 insertions(+), 61 deletions(-) diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py index fce15166..468f02bb 100644 --- a/mag_annotator/annotate_bins.py +++ b/mag_annotator/annotate_bins.py @@ -244,11 +244,16 @@ def kofam_hmmscan_formater(hits:pd.DataFrame, hmm_info_path:str=None, use_dbcan2 -def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, db_handler=None): +def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, logger:logging.Logger, + db_handler=None): + categories_col = f"{db_name}_categories" + id_col = f"{db_name}_id" + description_col = f"{db_name}_description" hits_sig = hits[hits.apply(get_sig_row, axis=1)] if len(hits_sig) == 0: + logger.warn("No significant hits for vog_db") # if nothing significant then return nothing, don't get descriptions - return pd.DataFrame() + return pd.DataFrame(columns=[categories_col, id_col, description_col]) # Get the best hits hits_best = hits_sig.sort_values('full_evalue').drop_duplicates(subset=["query_id"]) if db_handler is None: @@ -257,16 +262,16 @@ def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, db_handler=None): # get_descriptions desc_col = f"{db_name}_hits" descriptions = pd.DataFrame( - db_handler.get_descriptions(hits_best['target_id'].unique(), f"{db_name}_description"), + db_handler.get_descriptions(hits_best['target_id'].unique(), description_col), index=[desc_col]).T categories = descriptions[desc_col].apply(lambda x: x.split('; ')[-1]) - descriptions[f"{db_name}_categories"] = categories.apply( + descriptions[categories_col] = categories.apply( lambda x: ';'.join(set([x[i:i + 2] for i in range(0, len(x), 2)]))) descriptions['target_id'] = descriptions.index hits_df = pd.merge(hits_best[['query_id', 'target_id']], descriptions, on=f'target_id') hits_df.set_index('query_id', inplace=True, drop=True) hits_df.rename_axis(None, inplace=True) - hits_df.rename(columns={'target_id': f"{db_name}_id"}, inplace=True) + hits_df.rename(columns={'target_id': id_col}, inplace=True) return hits_df @@ -753,6 +758,7 @@ def annotate_orfs(gene_faa, db_handler, tmp_dir, logger, custom_db_locs=(), cust formater=partial( vogdb_hmmscan_formater, db_name='vogdb', + logger=logger, db_handler=db_handler ), logger=logger)) @@ -891,7 +897,6 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size= tmp_dir = path.join(output_dir, 'working_dir') mkdir(tmp_dir) - breakpoint() # setup custom databases to be searched custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'), logger, threads, verbose) @@ -907,7 +912,6 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size= logger.info('Annotating %s' % fasta_name) fasta_dir = path.join(tmp_dir, fasta_name) mkdir(fasta_dir) - breakpoint() annotations_list.append( annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode, trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs, @@ -929,45 +933,6 @@ def make_fasta_namses_df(fasta_loc): return pd.DataFrame(names) -def annotate_fastas_as_one(fasta_locs, output_dir, db_handler, logger:logging.Logger, min_contig_size=5000, prodigal_mode='meta', - trans_table='11', bit_score_threshold=60, rbh_bit_score_threshold=350, custom_db_name=(), - custom_fasta_loc=(), custom_hmm_name=(), custom_hmm_loc=(), custom_hmm_cutoffs_loc=(), - kofam_use_dbcan2_thresholds=False, skip_trnascan=False, rename_bins=True, keep_tmp_dir=True, - threads=10, verbose=True): - # check for no conflicting options/configurations - tmp_dir = path.join(output_dir, 'working_dir') - mkdir(tmp_dir) - - # setup custom databases to be searched - custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'), - logger, threads, verbose) - custom_hmm_locs = process_custom_hmms(custom_hmm_loc, custom_hmm_name, logger) - custom_hmm_cutoffs_locs= process_custom_hmm_cutoffs(custom_hmm_cutoffs_loc, custom_hmm_name, logger) - logger.info('Retrieved database locations and descriptions') - - # iterate over list of fastas and annotate each individually - annotations_list = list() - names = pd.concat([]) - for fasta_loc in fasta_locs: - # get name of file e.g. /home/shaffemi/my_genome.fa -> my_genome - fasta_name = get_fasta_name(fasta_loc) - logger.info('Annotating %s' % fasta_name) - fasta_dir = path.join(tmp_dir, fasta_name) - mkdir(fasta_dir) - annotations_list.append( - annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode, - trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs, - bit_score_threshold, rbh_bit_score_threshold, kofam_use_dbcan2_thresholds, - skip_trnascan, threads, rename_bins, keep_tmp_dir, verbose)) - logger.info('Annotations complete, processing annotations') - all_annotations = merge_annotations(annotations_list, output_dir) - - # clean up - if not keep_tmp_dir: - rmtree(tmp_dir) - return all_annotations - - # TODO: Add force flag to remove output dir if it already exists # TODO: Add continute flag to continue if output directory already exists # TODO: make fasta loc either a string or list to remove annotate_bins_cmd and annotate_called_genes_cmd? @@ -1212,7 +1177,7 @@ def merge_annotations_cmd(input_dirs, output_dir): mkdir(output_dir) # Get a logger annotations_list = list() - log_file_path = path.join(output_dir, "annotation.log") + log_file_path = path.join(output_dir, "annotate.log") logger = logging.getLogger('annotate.log') setup_logger(logger, log_file_path) logger.info(f"The log file is created at {log_file_path}") diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py index e87c257c..23e2b5db 100644 --- a/mag_annotator/annotate_vgfs.py +++ b/mag_annotator/annotate_vgfs.py @@ -16,7 +16,7 @@ from mag_annotator.utils import setup_logger from mag_annotator.summarize_genomes import get_ids_from_annotations_all -VMAG_DBS_TO_ANNOTATE = ('kegg', 'kofam', 'kofam_ko_list', 'uniref', 'peptidase', 'pfam', 'dbcan', 'viral', 'vogdb') +VMAG_DBS_TO_ANNOTATE = ('kegg', 'kofam_hmm', 'kofam_ko_list', 'uniref', 'peptidase', 'pfam', 'dbcan', 'viral', 'vogdb') VIRSORTER_COLUMN_NAMES = ['gene_name', 'start_position', 'end_position', 'length', 'strandedness', 'viral_protein_cluster_hit', 'viral_protein_cluster_hit_score', 'viral_protein_cluster_hit_evalue', 'viral_protein_cluster_category', 'pfam_hit', @@ -288,9 +288,6 @@ def get_metabolic_flags(annotations, metabolic_genes, amgs, verified_amgs, scaff flag_dict = dict() metabolic_genes = set(metabolic_genes) for scaffold, scaffold_annotations in annotations.groupby('scaffold'): - # perc_xh = sum([i == 'Xh' if not pd.isna(i) else False for i in scaffold_annotations['vogdb_categories']]) \ - # / scaffold_annotations.shape[0] - # is_j = perc_xh >= 0.18 for gene, row in scaffold_annotations.iterrows(): # set up flags = '' @@ -431,10 +428,9 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ custom_hmm_name=(), use_uniref=False, kofam_use_dbcan2_thresholds=False, skip_trnascan=False, keep_tmp_dir=True, low_mem_mode=False, threads=10, verbose=True): mkdir(output_dir) - log_file_path = path.join(output_dir, "Annotation.log") + log_file_path = path.join(output_dir, "annotate.log") logger = logging.getLogger('annotation_log') setup_logger(logger, log_file_path) - breakpoint() # check inputs prodigal_modes = ['train', 'meta', 'single'] @@ -447,9 +443,10 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # get database locations db_handler = DatabaseHandler(logger) - breakpoint() - db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE) - breakpoint() + db_handler.filter_db_locs(low_mem_mode=low_mem_mode, + use_uniref=use_uniref, + use_vogdb=True, + master_list=VMAG_DBS_TO_ANNOTATE) if virsorter_affi_contigs is not None: virsorter_hits = get_virsorter_hits(virsorter_affi_contigs) else: @@ -484,7 +481,6 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_ # annotate vMAGs rename_bins = False - breakpoint() annotations = annotate_fastas( fasta_locs=contig_locs, output_dir=output_dir, diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py index ff295b0a..2dc8e485 100644 --- a/mag_annotator/summarize_vgfs.py +++ b/mag_annotator/summarize_vgfs.py @@ -236,7 +236,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili # make output folder mkdir(output_dir) if log_file_path is None: - log_file_path = path.join(output_dir, "Distillation.log") + log_file_path = path.join(output_dir, "distill.log") logger = logging.getLogger('distillation_log') setup_logger(logger, log_file_path) logger.info(f"The log file is created at {log_file_path}") @@ -258,10 +258,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili # get potential AMGs potential_amgs = filter_to_amgs(annotations, max_aux=max_auxiliary_score, remove_transposons=remove_transposons, remove_fs=remove_fs) - check_columns(annotations, logger) check_columns(potential_amgs, logger) - annotations.kegg_hit - annotations.iloc[0] logger.info('Determined potential amgs') # make distillate From bc9fd2e88de896c32c3f4826e48799aa2350c1a4 Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Tue, 18 Oct 2022 14:59:47 -0600 Subject: [PATCH 05/10] Tweak some fixes from amg_summery fix --- mag_annotator/annotate_bins.py | 55 +++++++-------------------------- mag_annotator/summarize_vgfs.py | 3 -- 2 files changed, 11 insertions(+), 47 deletions(-) diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py index 0c8dcb15..468f02bb 100644 --- a/mag_annotator/annotate_bins.py +++ b/mag_annotator/annotate_bins.py @@ -244,11 +244,16 @@ def kofam_hmmscan_formater(hits:pd.DataFrame, hmm_info_path:str=None, use_dbcan2 -def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, db_handler=None): +def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, logger:logging.Logger, + db_handler=None): + categories_col = f"{db_name}_categories" + id_col = f"{db_name}_id" + description_col = f"{db_name}_description" hits_sig = hits[hits.apply(get_sig_row, axis=1)] if len(hits_sig) == 0: + logger.warn("No significant hits for vog_db") # if nothing significant then return nothing, don't get descriptions - return pd.DataFrame() + return pd.DataFrame(columns=[categories_col, id_col, description_col]) # Get the best hits hits_best = hits_sig.sort_values('full_evalue').drop_duplicates(subset=["query_id"]) if db_handler is None: @@ -257,16 +262,16 @@ def vogdb_hmmscan_formater(hits:pd.DataFrame, db_name:str, db_handler=None): # get_descriptions desc_col = f"{db_name}_hits" descriptions = pd.DataFrame( - db_handler.get_descriptions(hits_best['target_id'].unique(), f"{db_name}_description"), + db_handler.get_descriptions(hits_best['target_id'].unique(), description_col), index=[desc_col]).T categories = descriptions[desc_col].apply(lambda x: x.split('; ')[-1]) - descriptions[f"{db_name}_categories"] = categories.apply( + descriptions[categories_col] = categories.apply( lambda x: ';'.join(set([x[i:i + 2] for i in range(0, len(x), 2)]))) descriptions['target_id'] = descriptions.index hits_df = pd.merge(hits_best[['query_id', 'target_id']], descriptions, on=f'target_id') hits_df.set_index('query_id', inplace=True, drop=True) hits_df.rename_axis(None, inplace=True) - hits_df.rename(columns={'target_id': f"{db_name}_id"}, inplace=True) + hits_df.rename(columns={'target_id': id_col}, inplace=True) return hits_df @@ -753,6 +758,7 @@ def annotate_orfs(gene_faa, db_handler, tmp_dir, logger, custom_db_locs=(), cust formater=partial( vogdb_hmmscan_formater, db_name='vogdb', + logger=logger, db_handler=db_handler ), logger=logger)) @@ -927,45 +933,6 @@ def make_fasta_namses_df(fasta_loc): return pd.DataFrame(names) -def annotate_fastas_as_one(fasta_locs, output_dir, db_handler, logger:logging.Logger, min_contig_size=5000, prodigal_mode='meta', - trans_table='11', bit_score_threshold=60, rbh_bit_score_threshold=350, custom_db_name=(), - custom_fasta_loc=(), custom_hmm_name=(), custom_hmm_loc=(), custom_hmm_cutoffs_loc=(), - kofam_use_dbcan2_thresholds=False, skip_trnascan=False, rename_bins=True, keep_tmp_dir=True, - threads=10, verbose=True): - # check for no conflicting options/configurations - tmp_dir = path.join(output_dir, 'working_dir') - mkdir(tmp_dir) - - # setup custom databases to be searched - custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'), - logger, threads, verbose) - custom_hmm_locs = process_custom_hmms(custom_hmm_loc, custom_hmm_name, logger) - custom_hmm_cutoffs_locs= process_custom_hmm_cutoffs(custom_hmm_cutoffs_loc, custom_hmm_name, logger) - logger.info('Retrieved database locations and descriptions') - - # iterate over list of fastas and annotate each individually - annotations_list = list() - names = pd.concat([]) - for fasta_loc in fasta_locs: - # get name of file e.g. /home/shaffemi/my_genome.fa -> my_genome - fasta_name = get_fasta_name(fasta_loc) - logger.info('Annotating %s' % fasta_name) - fasta_dir = path.join(tmp_dir, fasta_name) - mkdir(fasta_dir) - annotations_list.append( - annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode, - trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs, - bit_score_threshold, rbh_bit_score_threshold, kofam_use_dbcan2_thresholds, - skip_trnascan, threads, rename_bins, keep_tmp_dir, verbose)) - logger.info('Annotations complete, processing annotations') - all_annotations = merge_annotations(annotations_list, output_dir) - - # clean up - if not keep_tmp_dir: - rmtree(tmp_dir) - return all_annotations - - # TODO: Add force flag to remove output dir if it already exists # TODO: Add continute flag to continue if output directory already exists # TODO: make fasta loc either a string or list to remove annotate_bins_cmd and annotate_called_genes_cmd? diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py index c091689d..c5c8760b 100644 --- a/mag_annotator/summarize_vgfs.py +++ b/mag_annotator/summarize_vgfs.py @@ -243,10 +243,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili # get potential AMGs potential_amgs = filter_to_amgs(annotations, max_aux=max_auxiliary_score, remove_transposons=remove_transposons, remove_fs=remove_fs) - check_columns(annotations, logger) check_columns(potential_amgs, logger) - annotations.kegg_hit - annotations.iloc[0] logger.info('Determined potential amgs') # make distillate From fab203a990e234eee8e798f932f3475ec07e8dbb Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Wed, 4 Jan 2023 14:32:44 -0700 Subject: [PATCH 06/10] AMG summary improved, some warnings prevented This update changes the AMG summary so that it will provide all information from both the Metabolic DB and The AMG DB. It is a bit of a kitchen sink approach but we can always cut it down if needed and most dram-v users I would call advanced. There are also some small quality of life improvements. --- mag_annotator/summarize_genomes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mag_annotator/summarize_genomes.py b/mag_annotator/summarize_genomes.py index 3c2d22a9..c58cb130 100644 --- a/mag_annotator/summarize_genomes.py +++ b/mag_annotator/summarize_genomes.py @@ -97,7 +97,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou for genome, frame in annotations.groupby(groupby_column, sort=False): # make dict of identifiers to gene names id_gene_dict = defaultdict(list) - for gene, ids in get_ids_from_annotations_by_row(frame).iteritems(): + for gene, ids in get_ids_from_annotations_by_row(frame).items(): for id_ in ids: id_gene_dict[id_].append(gene) # fill in genome summary_frame @@ -316,7 +316,7 @@ def get_module_step_coverage(kos, module_net): def make_module_coverage_df(annotation_df, module_nets): kos_to_genes = defaultdict(list) ko_id_name = 'kegg_id' if 'kegg_id' in annotation_df.columns else 'ko_id' - for gene_id, ko_list in annotation_df[ko_id_name].iteritems(): + for gene_id, ko_list in annotation_df[ko_id_name].items(): if type(ko_list) is str: for ko in ko_list.split(','): kos_to_genes[ko].append(gene_id) From 135c08b00f6176c377654f1527ec92573c09190e Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Wed, 4 Jan 2023 14:36:46 -0700 Subject: [PATCH 07/10] A version bump, a new release will be made --- mag_annotator/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mag_annotator/__init__.py b/mag_annotator/__init__.py index 4e7c72a5..9e0feee7 100644 --- a/mag_annotator/__init__.py +++ b/mag_annotator/__init__.py @@ -1 +1 @@ -__version__ = '1.4.3' +__version__ = '1.4.4' From 7b253d79c512ff5fcce7b3c557b04bf2eb36f2b3 Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Wed, 4 Jan 2023 14:42:22 -0700 Subject: [PATCH 08/10] Output folders now required in almost all cases --- scripts/DRAM-v.py | 6 +++--- scripts/DRAM.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/DRAM-v.py b/scripts/DRAM-v.py index 38abc0e9..93b110f3 100644 --- a/scripts/DRAM-v.py +++ b/scripts/DRAM-v.py @@ -27,7 +27,7 @@ annotate_parser.add_argument('-i', '--input_fasta', help="fasta file, output from ", required=True) annotate_parser.add_argument('-v', '--virsorter_affi_contigs', help="VirSorter VIRSorter_affi-contigs.tab " "output file") - annotate_parser.add_argument('-o', '--output_dir', help="output directory") + annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True) annotate_parser.add_argument('--min_contig_size', type=int, default=2500, help='minimum contig size to be used for gene prediction') annotate_parser.add_argument('--split_contigs', action='store_true', default=False, @@ -78,7 +78,7 @@ # parser for summarizing genomes distill_parser.add_argument("-i", "--input_file", help="Annotations path") - distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes") + distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes", required=True) distill_parser.add_argument("--groupby_column", help="Column from annotations to group as VGF units", default='scaffold') distill_parser.add_argument("--max_auxiliary_score", type=int, default=3, @@ -134,7 +134,7 @@ # parser for getting gene neighborhoods neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path") - neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods") + neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods", required=True) neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around") neighborhood_parser.add_argument("--identifiers", nargs='*', help="Database identifiers assigned by DRAM to find neighborhoods around") diff --git a/scripts/DRAM.py b/scripts/DRAM.py index 2430a68d..50228449 100644 --- a/scripts/DRAM.py +++ b/scripts/DRAM.py @@ -32,7 +32,7 @@ annotate_parser.add_argument('-i', '--input_fasta', action='append', help="fasta file, optionally with wildcards to point to multiple fastas", required=True) - annotate_parser.add_argument('-o', '--output_dir', help="output directory") + annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True) annotate_parser.add_argument('--min_contig_size', type=int, default=2500, help='minimum contig size to be used for gene prediction') annotate_parser.add_argument('--config_loc', default=None, @@ -130,7 +130,7 @@ # parser for summarizing genomes distill_parser.add_argument("-i", "--input_file", help="Annotations path") - distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes") + distill_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write summarized genomes") distill_parser.add_argument('--log_file_path', help="A name and loctation for the log file") distill_parser.add_argument("--rrna_path", help="rRNA output from annotation") @@ -179,7 +179,7 @@ # parser for getting gene neighborhoods neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path") - neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods") + neighborhood_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write gene neighborhoods") neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around") neighborhood_parser.add_argument("--identifiers", nargs='*', help="Database identifiers assigned by DRAM to find neighborhoods around") @@ -199,7 +199,7 @@ # TODO: Make it work with append so you can use multiple -i's merge_annotations_parser.add_argument("-i", "--input_dirs", help="Path with wildcards pointing to DRAM annotation " "output directories") - merge_annotations_parser.add_argument("-o", "--output_dir", help="Path to output merged annotations files") + merge_annotations_parser.add_argument("-o", "--output_dir", required=True, help="Path to output merged annotations files") merge_annotations_parser.set_defaults(func=merge_annotations_cmd) args = parser.parse_args() From 7202d7154723755f349e8b0667b5d2895423d57e Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Wed, 4 Jan 2023 15:16:59 -0700 Subject: [PATCH 09/10] Fix one test and remove some commented code --- mag_annotator/database_processing.py | 48 ---------------------------- tests/test_summarize_vgfs.py | 6 ++-- 2 files changed, 4 insertions(+), 50 deletions(-) diff --git a/mag_annotator/database_processing.py b/mag_annotator/database_processing.py index 72dff9ea..cee0eefa 100644 --- a/mag_annotator/database_processing.py +++ b/mag_annotator/database_processing.py @@ -26,54 +26,6 @@ DEFAULT_MMMSPRO_DB_NAME = 'db' -# KOFAM_CITATION = ("Aramaki T., Blanc-Mathieu R., Endo H., Ohkubo K., Kanehisa " -# "M., Goto S., Ogata H.\nKofamKOALA: KEGG ortholog assignment" -# " based on profile HMM and adaptive score threshold.\nBioinf" -# "ormatics. 2019 Nov 19. pii: btz859. doi: 10.1093/bioinforma" -# "tics/btz859." -# ) # arguably not for kofam but the closest I saw -# VIRAL_REFSEQ_CITATION = ("Brister JR, Ako-Adjei D, Bao Y, Blinkova O. NCBI vir" -# "al genomes resource. Nucleic Acids Res. 2015 Jan;43(" -# "Database issue):D571-7 PubMed PubMedCentral" -# ) # Three options but this one is viral specific -# KEGG_CITATION = ("Kanehisa, M., Furumichi, M., Sato, Y., Ishiguro-Watanabe, M." -# ", and Tanabe, M.; KEGG: integrating viruses and cellular org" -# "anisms. Nucleic Acids Res. 49, D545-D551 (2021)." -# ) -# PFAM_CITATION = ("Pfam: The protein families database in 2021: J. Mistry, S. C" -# "huguransky, L. Williams, M. Qureshi, G.A. Salazar, E.L.L. So" -# "nnhammer, S.C.E. Tosatto, L. Paladin, S. Raj, L.J. Richardso" -# "n, R.D. Finn, A. Bateman" -# ) -# PEPTIDASE_CITATION = ("Rawlings, N.D., Barrett, A.J., Thomas, P.D., Huang, X.," -# " Bateman, A. & Finn, R.D. (2018) The MEROPS database of" -# " proteolytic enzymes, their substrates and inhibitors i" -# "n 2017 and a comparison with peptidases in the PANTHER " -# "database. Nucleic Acids Res 46, D624-D632." -# ) -# VOGDB_CITATION = ("Thannesberger, J., Hellinger, H. J., Klymiuk, I., Kastner, M" -# ". T., Rieder, F. J., Schneider, M., ... & Steininger, C. (20" -# "17). Viruses comprise an extensive pool of mobile genetic el" -# "ements in eukaryote cell cultures and human clinical samples" -# ". The FASEB Journal, 31(5), 1987-2000." -# ) -# UNIREF_CITATION = ("Wang Y, Wang Q, Huang H, Huang W, Chen Y, McGarvey PB, Wu C" -# "H, Arighi CN, UniProt Consortium. A crowdsourcing open plat" -# "form for literature curation in UniProt Plos Biology. 19(12" -# "):e3001464 (2021)" -# ) -# DBCAN_CITATION = ("Yin Y*, Mao X*, Yang JC, Chen X, Mao F and Xu Y, dbCAN: a we" -# "b resource for automated carbohydrate-active enzyme annotati" -# "on, Nucleic Acids Res. 2012 Jul;40(Web Server issue):W445-51" -# ) # again a citation for the tool not the db -# DRAM_CITATION = ("M. Shaffer, M. A. Borton, B. B. McGivern, A. A. Zayed, S. L. " -# "La Rosa, L. M. Solden, P. Liu, A. B. Narrowe, J. Rodríguez-Ra" -# "mos, B. Bolduc et al., \"Dram for distilling microbial metabo" -# "lism to automate the curation of microbiome function,\" Nucle" -# "ic acids research, vol. 48, no. 16, pp. 8883–8900, 2020." -# ) - - KOFAM_CITATION = ("T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa" ", S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignme" "nt based on profile hmm and adaptive score threshold,\" Bioin" diff --git a/tests/test_summarize_vgfs.py b/tests/test_summarize_vgfs.py index b9e5681f..9e890274 100644 --- a/tests/test_summarize_vgfs.py +++ b/tests/test_summarize_vgfs.py @@ -46,8 +46,10 @@ def genome_summary_frame(): columns=['gene_description', 'module', 'sheet', 'header', 'subheader'])) -def test_make_viral_distillate(pamgs, genome_summary_frame): - test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame) +def test_make_viral_distillate(pamgs, genome_summary_frame, logger): + amg_database = pd.DataFrame() + test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame, amg_database=amg_database, logger=logger) + breakpoint() viral_distillate = pd.DataFrame([['gene_3', 'scaffold_1', 'K00001', 'description', 'main', 'header1', 'subheader1', 'module1', 3, 'MFTJ'], ['gene_3', 'scaffold_1', 'K00001', 'description3', 'other', 'header1', From e59237687370c32833b839d66f6baf89d2d22b24 Mon Sep 17 00:00:00 2001 From: Rory M Flynn Date: Thu, 5 Jan 2023 15:23:15 -0700 Subject: [PATCH 10/10] add a test for the new amg summary I could not fix the sql someone more familiar with sql is free to do so I will make a branch to deal with this --- mag_annotator/database_handler.py | 5 +- mag_annotator/database_setup.py | 20 --- tests/test_summarize_vgfs.py | 201 ++++++++++++++++++++++++------ 3 files changed, 165 insertions(+), 61 deletions(-) diff --git a/mag_annotator/database_handler.py b/mag_annotator/database_handler.py index 78a8bcd7..a94bca9f 100644 --- a/mag_annotator/database_handler.py +++ b/mag_annotator/database_handler.py @@ -5,11 +5,10 @@ import logging from shutil import copy2 import warnings -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker - from datetime import datetime from functools import partial +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker import pandas as pd diff --git a/mag_annotator/database_setup.py b/mag_annotator/database_setup.py index c0c2216d..dbe686c6 100644 --- a/mag_annotator/database_setup.py +++ b/mag_annotator/database_setup.py @@ -82,25 +82,6 @@ def serialize(self): 'dbcan_subfam_ec': self.ec, } - -# DBCAN_SUBFAM_EC_TABLE_NAME = 'dbcan_subfam_ec' - - -# class DbcanSubfamEC(Base): -# __tablename__ = DBCAN_SUBFAM_EC_TABLE_NAME -# -# id = Column(String(30), primary_key=True, nullable=False, index=True) -# -# description = Column(String(1000)) -# -# @property -# def serialize(self): -# return { -# 'dbcan_id': self.id, -# 'dbcan_subfam_ec': self.description, -# } - - VIRAL_DESCRIPTION_TABLE_NAME = 'viral_description' @@ -164,7 +145,6 @@ def create_description_db(db_loc): UNIREF_DESCRIPTION_TABLE_NAME: UniRefDescription, PFAM_DESCRIPTION_TABLE_NAME: PfamDescription, DBCAN_DESCRIPTION_TABLE_NAME: DbcanDescription, - # DBCAN_SUBFAM_EC_TABLE_NAME: DbcanSubfamEC, VIRAL_DESCRIPTION_TABLE_NAME: ViralDescription, PEPTIDASE_DESCRIPTION_TABLE_NAME: PeptidaseDescription, VOGDB_DESCRIPTION_TABLE_NAME: VOGDBDescription} diff --git a/tests/test_summarize_vgfs.py b/tests/test_summarize_vgfs.py index 9e890274..01639a33 100644 --- a/tests/test_summarize_vgfs.py +++ b/tests/test_summarize_vgfs.py @@ -1,28 +1,49 @@ import pytest +import logging +from mag_annotator.utils import setup_logger import pandas as pd import altair as alt -from mag_annotator.summarize_vgfs import filter_to_amgs, get_strand_switches, make_viral_distillate, make_vgf_order, \ - make_amg_count_column, make_viral_functional_df, make_viral_functional_heatmap +from mag_annotator.summarize_vgfs import ( + filter_to_amgs, + get_strand_switches, + make_viral_distillate, + make_vgf_order, + make_amg_count_column, + make_viral_functional_df, + make_viral_functional_heatmap, +) @pytest.fixture() def annotations(): - return pd.DataFrame([['scaffold_1', 'K99999', 'VTF', 1], - ['scaffold_1', 'K12345', 'M', 4], - ['scaffold_1', 'K00001', 'MFTJ', 3], - ['scaffold_1', 'K11111', 'MKE', 2]], - index=['gene_1', 'gene_2', 'gene_3', 'gene_4'], - columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score']) + return pd.DataFrame( + [ + ["scaffold_1", "K99999", "VTF", 1], + ["scaffold_1", "K12345", "M", 4], + ["scaffold_1", "K00001", "MFTJ", 3], + ["scaffold_1", "K11111", "MKE", 2], + ], + index=["gene_1", "gene_2", "gene_3", "gene_4"], + columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"], + ) @pytest.fixture() def pamgs(): - return pd.DataFrame([['scaffold_1', 'K00001', 'MFTJ', 3], - ['scaffold_1', 'K11111', 'MKE', 2]], - index=['gene_3', 'gene_4'], - columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score']) + return pd.DataFrame( + [["scaffold_1", "K00001", "MFTJ", 3], ["scaffold_1", "K11111", "MKE", 2]], + index=["gene_3", "gene_4"], + columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"], + ) + + +@pytest.fixture() +def logger(tmpdir): + logger = logging.getLogger("test_log") + setup_logger(logger) + return logger def test_filter_to_amgs(annotations, pamgs): @@ -31,37 +52,129 @@ def test_filter_to_amgs(annotations, pamgs): def test_get_strand_switches(): - assert get_strand_switches(['+', '-', '+']) == 2 - assert get_strand_switches(['+', '+', '+']) == 0 - assert get_strand_switches(['-']) == 0 - assert get_strand_switches(['+', '+', '-', '-']) == 1 + assert get_strand_switches(["+", "-", "+"]) == 2 + assert get_strand_switches(["+", "+", "+"]) == 0 + assert get_strand_switches(["-"]) == 0 + assert get_strand_switches(["+", "+", "-", "-"]) == 1 @pytest.fixture() def genome_summary_frame(): - return pd.DataFrame(pd.DataFrame([['description', 'module1', 'main', 'header1', 'subheader1'], - ['description2', 'module2', 'main', 'header1', 'subheader1'], - ['description3', 'module3', 'other', 'header1', 'subheader1']], - index=['K00001', 'K12345', 'K00001'], - columns=['gene_description', 'module', 'sheet', 'header', 'subheader'])) + return pd.DataFrame( + pd.DataFrame( + [ + ["description", "module1", "main", "header1", "subheader1"], + ["description2", "module2", "main", "header1", "subheader1"], + ["description3", "module3", "other", "header1", "subheader1"], + ], + index=["K00001", "K12345", "K00001"], + columns=["gene_description", "module", "sheet", "header", "subheader"], + ) + ) def test_make_viral_distillate(pamgs, genome_summary_frame, logger): - amg_database = pd.DataFrame() - test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame, amg_database=amg_database, logger=logger) - breakpoint() - viral_distillate = pd.DataFrame([['gene_3', 'scaffold_1', 'K00001', 'description', 'main', 'header1', 'subheader1', - 'module1', 3, 'MFTJ'], - ['gene_3', 'scaffold_1', 'K00001', 'description3', 'other', 'header1', - 'subheader1', 'module3', 3, 'MFTJ'], - ['gene_4', 'scaffold_1', '', '', '', '', '', '', 2, 'MKE']], - columns=['gene', 'scaffold', 'gene_id', 'gene_description', 'category', 'header', - 'subheader', 'module', 'auxiliary_score', 'amg_flags']) - pd.testing.assert_frame_equal(test_viral_distillate, viral_distillate) + amg_database = pd.DataFrame( + [ + [ + "K11111", + "gene_4", + "gene_4", + "d4", + "module3", + "metabolism3", + "reference2", + "true", + ] + ], + columns=[ + "KO", + "EC", + "PFAM", + "gene", + "module", + "metabolism", + "reference", + "verified", + ], + ) + output_viral_distillate = make_viral_distillate( + pamgs, genome_summary_frame, amg_database=amg_database, logger=logger + ) + expected_viral_distillate = pd.DataFrame( + [ + [ + "gene_3", + "scaffold_1", + "K00001", + "description", + "header1", + "subheader1", + "module1", + 3, + "MFTJ", + "main", + "genome_summary_form", + "", + "", + ], + [ + "gene_3", + "scaffold_1", + "K00001", + "description3", + "header1", + "subheader1", + "module3", + 3, + "MFTJ", + "other", + "genome_summary_form", + "", + "", + "", + ], + [ + "gene_4", + "scaffold_1", + "K11111", + "d4", + "", + "", + "module3", + 2, + "MKE", + "", + "amg_database", + "metabolism3", + "reference2", + "true", + ], + ], + columns=[ + "gene", + "scaffold", + "gene_id", + "gene_description", + "header", + "subheader", + "module", + "auxiliary_score", + "amg_flags", + "sheet", + "gene_id_origin", + "metabolism", + "reference", + "verified", + ], + ) + output_viral_distillate = output_viral_distillate.sort_index(axis=1).fillna("") + expected_viral_distillate = expected_viral_distillate.sort_index(axis=1).fillna("") + pd.testing.assert_frame_equal(output_viral_distillate, expected_viral_distillate) def test_make_vgf_order(pamgs): - assert make_vgf_order(pamgs) == ['scaffold_1'] + assert make_vgf_order(pamgs) == ["scaffold_1"] def test_make_amg_count_column(pamgs): @@ -70,14 +183,26 @@ def test_make_amg_count_column(pamgs): @pytest.fixture() def viral_functional_df(): - return pd.DataFrame([['main', 'module1', 'gene_3', 'K00001', 'scaffold_1', True], - ['other', 'module3', 'gene_3', 'K00001', 'scaffold_1', True]], - columns=['Category', 'Function', 'AMG Genes', 'Genes Present', 'Contig Name', - 'Present in Contig']) + return pd.DataFrame( + [ + ["main", "module1", "gene_3", "K00001", "scaffold_1", True], + ["other", "module3", "gene_3", "K00001", "scaffold_1", True], + ], + columns=[ + "Category", + "Function", + "AMG Genes", + "Genes Present", + "Contig Name", + "Present in Contig", + ], + ) def test_make_viral_functional_df(pamgs, genome_summary_frame, viral_functional_df): - test_viral_function_df = make_viral_functional_df(pamgs, genome_summary_frame, 'scaffold') + test_viral_function_df = make_viral_functional_df( + pamgs, genome_summary_frame, "scaffold" + ) pd.testing.assert_frame_equal(test_viral_function_df, viral_functional_df)