From 052e52456581f99b28e8044b0ea1e4a7a26525eb Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Wed, 12 Oct 2022 13:50:33 -0600
Subject: [PATCH 01/10] Fix up how viral distillate works

---
 mag_annotator/pull_sequences.py |  1 -
 mag_annotator/summarize_vgfs.py | 98 +++++++++++----------------------
 2 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/mag_annotator/pull_sequences.py b/mag_annotator/pull_sequences.py
index d0046dca..4a68e842 100644
--- a/mag_annotator/pull_sequences.py
+++ b/mag_annotator/pull_sequences.py
@@ -84,7 +84,6 @@ def pull_sequences(input_tsv, input_fasta, output_fasta, fastas=None, scaffolds=
         if adjective_sheet is not None:
             adjective_sheet = pd.read_csv(input_tsv, sep='\t', index_col=0) 
             annotations = annotations.loc[adjective_sheet.index]
-            breakpoint()
         annotation_genes_to_keep = get_genes_from_identifiers(annotations, genes, fastas, scaffolds, identifiers,
                                                               categories, custom_distillate)
         annotations = annotations.loc[annotation_genes_to_keep]
diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py
index bd980724..ff295b0a 100644
--- a/mag_annotator/summarize_vgfs.py
+++ b/mag_annotator/summarize_vgfs.py
@@ -104,78 +104,48 @@ def make_viral_stats_table(annotations, potential_amgs, groupby_column='scaffold
     return pd.DataFrame(viral_stats_series).fillna(0)
 
 
-def make_viral_distillate(potential_amgs, genome_summary_form):
-    rows = list()
-    potential_amgs['ids'] = get_ids_from_annotations_by_row(potential_amgs)
-    potential_amgs.iloc[0]
-    logger =  logging.getLogger()
-    check_columns(potential_amgs, logger)
-    # breakpoint()
-    for gene, row in potential_amgs.iterrows():
-        gene_ids = row.ids & set(genome_summary_form.index)
-        if len(gene_ids) > 0:
-            for gene_id in gene_ids:
-                gene_summary = genome_summary_form.loc[gene_id]
-                # breakpoint()
-                if type(gene_summary) is pd.Series:
-                    rows.append([gene, row['scaffold'], gene_id, gene_summary['gene_description'],
-                                 gene_summary['sheet'], gene_summary['header'], gene_summary['subheader'],
-                                 gene_summary['module'], row['auxiliary_score'], row['amg_flags']])
-                else:
-                    for sub_gene_id, sub_gene_summary in gene_summary.iterrows():
-                        rows.append([gene, row['scaffold'], gene_id, sub_gene_summary['gene_description'],
-                                     sub_gene_summary['sheet'], sub_gene_summary['header'],
-                                     sub_gene_summary['subheader'], sub_gene_summary['module'],
-                                     row['auxiliary_score'], row['amg_flags']])
-        else:
-            warnings.warn("No distillate information found for gene %s" % gene)
-            rows.append([gene, row['scaffold'], '', '', '', '', '', '', row['auxiliary_score'],
-                         row['amg_flags']])
-    return pd.DataFrame(rows, columns=VIRAL_DISTILLATE_COLUMNS)
-
-
-def make_viral_distillate2(potential_amgs, genome_summary_form, amg_database, logger):
+def make_viral_distillate(potential_amgs, genome_summary_form, amg_database, logger):
     """Make a summary of what in our database makes somthing a AMG or likly AMG to dram"""
     # Transform the amg database to make it more workable
+    def look_up_metabolic_info(search_db, match_db, match_db_name):
+        id_genes = set(match_db.index)
+        return (
+            (search_db
+             .assign(gene_id = lambda x: x['ids'].apply(lambda y: y & id_genes))
+             )[['gene_id', 'scaffold', 'auxiliary_score', 'amg_flags']]
+            .explode('gene_id')
+            .dropna(subset=['gene_id'])
+            .merge(match_db, how='left', left_on='gene_id', right_index=True)
+            .assign(gene_id_origin=match_db_name))
+
     amg_database_frame = (amg_database
                           .melt(value_vars=['KO', 'EC', 'PFAM'], 
                                 id_vars=['gene', 'module', 'metabolism', 
                                          'reference', 'verified'],
                                 value_name='gene_id')
                           .drop('variable', axis=1)
+                          .assign(
+                              gene_id=lambda x: x['gene_id'].apply(
+                                  lambda y: [i.strip() for i in str(y).split(';')]))
+                          .explode('gene_id')
+                          .dropna(subset='gene_id')
                           .set_index('gene_id')
                           .rename(columns = {'gene': 'gene_description'})
                           )
-    metabolic_genes = set(genome_summary_form.index)
-    amg_db_genes = set(amg_database_frame.index)
-    # verified_amgs = get_amg_ids(amg_database_frame.loc[amg_database_frame.verified])
-    potential_amgs['ids'] = get_ids_from_annotations_by_row(potential_amgs)
-    potential_amgs['amg_ids'] = potential_amgs['ids'].apply(lambda x: x & amg_db_genes)
-    potential_amgs['metabo_ids'] = potential_amgs['ids'].apply(lambda x: x & metabolic_genes)
+    potential_amgs = potential_amgs.assign(ids=get_ids_from_annotations_by_row(potential_amgs))
+    metabolic_df = look_up_metabolic_info(potential_amgs, genome_summary_form, 'genome_summary_form')
+    amg_df = look_up_metabolic_info(potential_amgs, amg_database_frame, 'amg_database')
+    missing = list(set(potential_amgs.index) - (set(metabolic_df.index) | (set(amg_df.index)) ))
     # evaluate what is mising
+    logger.warning(f"No distillate information found for {len(missing)} genes.")
+    logger.debug('\n'.join(missing))
+
 
-    missing_info = potential_amgs[((potential_amgs['amg_ids'].apply(len) < 1) &
-                                  (potential_amgs['metabo_ids'].apply(len) < 1))]
-    logger.warning(f"No distillate information found for {len(missing_info.index.unique())} genes.")
-    logger.debug('\n'.join(missing_info.index.unique()))
-    missing_info['amg_flags']
-    def look_up_metabolic_info(search_db, ids_col, match_db, match_db_name):
-        return (
-            search_db[[ids_col, 'scaffold', 'auxiliary_score', 'amg_flags']]
-            .rename(columns={ids_col: 'gene_id'})
-            .explode('gene_id')
-            .dropna(subset=['gene_id'])
-            .merge(match_db,
-                   how='left',
-                   left_on='gene_id',
-                   right_index=True)
-            .assign(gene_id_origin=match_db_name))
     summary = pd.concat([
-        look_up_metabolic_info(potential_amgs, 'metabo_ids', genome_summary_form, 'genome_summary_form'),
-        look_up_metabolic_info(potential_amgs, 'amg_ids', amg_database_frame, 'amg_database'), 
-        missing_info[['scaffold', 'auxiliary_score', 'amg_flags']]])
+        metabolic_df,
+        amg_df,
+        potential_amgs.loc[missing, ['scaffold', 'auxiliary_score', 'amg_flags']]])
     summary.reset_index(inplace=True, drop=False, names='gene')
-    
     return summary
 
 
@@ -203,7 +173,7 @@ def make_viral_functional_df(annotations, genome_summary_form, groupby_column='s
     # build dict of ids per genome
     vgf_to_id_dict = defaultdict(defaultdict_list)
     for vgf, frame in annotations.groupby(groupby_column, sort=False):
-        for gene, id_list in get_ids_from_annotations_by_row(frame).iteritems():
+        for gene, id_list in get_ids_from_annotations_by_row(frame).items():
             for id_ in id_list:
                 vgf_to_id_dict[vgf][id_].append(gene)
     # build long from data frame
@@ -292,7 +262,6 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili
     check_columns(potential_amgs, logger)
     annotations.kegg_hit
     annotations.iloc[0]
-    # breakpoint()
     logger.info('Determined potential amgs')
 
     # make distillate
@@ -300,12 +269,11 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili
     viral_genome_stats.to_csv(path.join(output_dir, 'vMAG_stats.tsv'), sep='\t')
     logger.info('Calculated viral genome statistics')
 
-    # viral_distillate = make_viral_distillate2(
-    #     potential_amgs,
-    #     genome_summary_form,
-    #     pd.read_csv(database_handler.config["dram_sheets"].get('amg_database'), sep='\t'),
-    #     logger)
-    viral_distillate = make_viral_distillate(potential_amgs, genome_summary_form)
+    viral_distillate = make_viral_distillate(
+        potential_amgs,
+        genome_summary_form,
+        pd.read_csv(database_handler.config["dram_sheets"].get('amg_database'), sep='\t'),
+        logger)
     viral_distillate.to_csv(path.join(output_dir, 'amg_summary.tsv'), sep='\t', index=None)
     logger.info('Generated AMG summary')
 

From 522bc4b6cc4f89cdaf88e0c06097f6b865883c79 Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Fri, 14 Oct 2022 13:29:35 -0600
Subject: [PATCH 02/10] lets see why vogdb is not running

---
 mag_annotator/CONFIG           | 167 +++++++++++++++++++++++++++++----
 mag_annotator/annotate_vgfs.py |  27 +++++-
 2 files changed, 169 insertions(+), 25 deletions(-)

diff --git a/mag_annotator/CONFIG b/mag_annotator/CONFIG
index bc8aa5c7..0f019614 100644
--- a/mag_annotator/CONFIG
+++ b/mag_annotator/CONFIG
@@ -1,28 +1,155 @@
 {
   "search_databases": {
-    "kegg": null,
-    "kofam_hmm": null,
-    "kofam_ko_list": null,
-    "uniref": null,
-    "pfam": null,
-    "dbcan": null,
-    "viral": null,
-    "peptidase": null,
-    "vogdb": null
+    "kegg": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kegg.20221012.mmsdb",
+    "kofam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_profiles.hmm",
+    "kofam_ko_list": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/kofam_ko_list.tsv",
+    "uniref": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/uniref90.20220928.mmsdb",
+    "pfam": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/pfam.mmspro",
+    "dbcan": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/dbCAN-HMMdb-V11.txt",
+    "viral": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/refseq_viral.20220928.mmsdb",
+    "peptidase": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/peptidases.20220928.mmsdb",
+    "vogdb": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_latest_hmms.txt"
   },
-  "custom_dbs": null,
   "database_descriptions": {
-    "pfam_hmm_dat": null,
-    "dbcan_fam_activities": null,
-    "vog_annotations": null
+    "pfam_hmm": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/Pfam-A.hmm.dat.gz",
+    "dbcan_fam_activities": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam-activities.txt",
+    "dbcan_subfam_ec": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/CAZyDB.08062022.fam.subfam.ec.txt",
+    "vog_annotations": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/vog_annotations_latest.tsv.gz"
   },
   "dram_sheets": {
-    "genome_summary_form": null,
-    "module_step_form": null,
-    "etc_module_database": null,
-    "function_heatmap_form": null,
-    "amg_database": null
+    "genome_summary_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/genome_summary_form.20220928.tsv",
+    "module_step_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/module_step_form.20220928.tsv",
+    "etc_module_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/etc_mdoule_database.20220928.tsv",
+    "function_heatmap_form": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/function_heatmap_form.20220928.tsv",
+    "amg_database": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/amg_database.20220928.tsv"
   },
-  "description_db": null,
-  "dram_version": null
+  "dram_version": "1.4.0rc1",
+  "description_db": "/home/projects-wrighton-2/DRAM/development_flynn/public_DRAM/sep_12_22_dram1.4_rc_setup_test/testoutput/DRAM1_4_pycallgraph_3/description_db.sqlite",
+  "setup_info": {
+    "kegg": {
+      "name": "KEGG db",
+      "description_db_updated": "10/12/2022, 18:52:36",
+      "citation": " M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tanabe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021."
+    },
+    "kofam_hmm": {
+      "name": "KOfam db",
+      "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
+      "Download time": "09/28/2022, 11:00:09",
+      "Origin": "Downloaded by DRAM"
+    },
+    "kofam_ko_list": {
+      "name": "KOfam KO list",
+      "citation": "T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa, S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignment based on profile hmm and adaptive score threshold,\" Bioinformatics, vol. 36, no. 7, pp. 2251\u20132252, 2020.",
+      "Download time": "09/28/2022, 11:00:11",
+      "Origin": "Downloaded by DRAM"
+    },
+    "uniref": {
+      "name": "UniRef db",
+      "description_db_updated": "09/29/2022, 13:14:40",
+      "citation": "Y. Wang, Q. Wang, H. Huang, W. Huang, Y. Chen, P. B. McGarvey, C. H. Wu, C. N. Arighi, and U. Consortium, \"A crowdsourcing open platform for literature curation in uniprot,\" PLoS Biology, vol. 19, no. 12, p. e3001464, 2021.",
+      "version": "90",
+      "Download time": "09/28/2022, 11:15:01",
+      "Origin": "Downloaded by DRAM"
+    },
+    "pfam": {
+      "name": "Pfam db",
+      "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
+      "Download time": "09/28/2022, 11:49:29",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:23:47"
+    },
+    "pfam_hmm": {
+      "name": "Pfam hmm dat",
+      "description_db_updated": "Unknown, or Never",
+      "citation": "J. Mistry, S. Chuguransky, L. Williams, M. Qureshi, G. A. Salazar, E. L. Sonnhammer, S. C. Tosatto, L. Paladin, S. Raj, L. J. Richardson et al., \"Pfam: The protein families database in 2021,\" Nucleic acids research, vol. 49, no. D1, pp. D412\u2013D419, 2021.",
+      "Download time": "09/28/2022, 11:49:31",
+      "Origin": "Downloaded by DRAM"
+    },
+    "dbcan": {
+      "name": "dbCAN db",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:23:50"
+    },
+    "dbcan_fam_activities": {
+      "name": "dbCAN family activities",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "upload_date": "08062022",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM"
+    },
+    "dbcan_subfam_ec": {
+      "name": "dbCAN subfamily EC numbers",
+      "citation": "Y. Yin, X. Mao, J. Yang, X. Chen, F. Mao, and Y. Xu, \"dbcan: a web resource for automated carbohydrate-active enzyme annotation,\" Nucleic acids research, vol. 40, no. W1, pp. W445\u2013W451, 2012.",
+      "version": "11",
+      "upload_date": "08062022",
+      "Download time": "09/28/2022, 11:49:33",
+      "Origin": "Downloaded by DRAM"
+    },
+    "vogdb": {
+      "name": "VOGDB db",
+      "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
+      "version": "latest",
+      "Download time": "09/28/2022, 11:51:57",
+      "Origin": "Downloaded by DRAM",
+      "description_db_updated": "09/29/2022, 13:24:14"
+    },
+    "vog_annotations": {
+      "name": "VOG annotations",
+      "description_db_updated": "Unknown, or Never",
+      "citation": "J. Thannesberger, H.-J. Hellinger, I. Klymiuk, M.-T. Kastner, F. J. Rieder, M. Schneider, S. Fister, T. Lion, K. Kosulin, J. Laengle et al., \"Viruses comprise an extensive pool of mobile genetic elements in eukaryote cell cultures and human clinical samples,\" The FASEB Journal, vol. 31, no. 5, pp. 1987\u20132000, 2017.",
+      "version": "latest",
+      "Download time": "09/28/2022, 11:51:58",
+      "Origin": "Downloaded by DRAM"
+    },
+    "viral": {
+      "name": "RefSeq Viral db",
+      "description_db_updated": "09/29/2022, 13:16:15",
+      "citation": "J. R. Brister, D. Ako-Adjei, Y. Bao, and O. Blinkova, \"Ncbi viral genomes resource,\" Nucleic acids research, vol. 43, no. D1, pp. D571\u2013D577, 2015. [3] M. Kanehisa, M. Furumichi, Y. Sato, M. Ishiguro-Watanabe, and M. Tan-abe, \"Kegg: integrating viruses and cellular organisms,\" Nucleic acids research, vol. 49, no. D1, pp. D545\u2013D551, 2021.",
+      "viral_files": 2,
+      "Download time": "09/28/2022, 11:52:20",
+      "Origin": "Downloaded by DRAM"
+    },
+    "peptidase": {
+      "name": "MEROPS peptidase db",
+      "description_db_updated": "09/29/2022, 13:23:40",
+      "citation": "N. D. Rawlings, A. J. Barrett, P. D. Thomas, X. Huang, A. Bateman, and R. D. Finn, \"The merops database of proteolytic enzymes, their substrates and inhibitors in 2017 and a comparison with peptidases in the panther database,\" Nucleic acids research, vol. 46, no. D1, pp. D624\u2013D632, 2018.",
+      "Download time": "09/28/2022, 12:01:46",
+      "Origin": "Downloaded by DRAM"
+    },
+    "genome_summary_form": {
+      "name": "Genome summary form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:46",
+      "Origin": "Downloaded by DRAM"
+    },
+    "module_step_form": {
+      "name": "Module step form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "function_heatmap_form": {
+      "name": "Function heatmap form",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "amg_database": {
+      "name": "AMG database",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    },
+    "etc_module_database": {
+      "name": "ETC module database",
+      "branch": "master",
+      "Download time": "09/28/2022, 12:01:47",
+      "Origin": "Downloaded by DRAM"
+    }
+  },
+  "log_path": null
 }
diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py
index 642ca929..eaf0ffb3 100644
--- a/mag_annotator/annotate_vgfs.py
+++ b/mag_annotator/annotate_vgfs.py
@@ -446,7 +446,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # get database locations
     db_handler = DatabaseHandler(logger)
-    db_handler.filter_db_locs(low_mem_mode, use_uniref, True, VMAG_DBS_TO_ANNOTATE)
+    db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE)
 
     if virsorter_affi_contigs is not None:
         virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
@@ -482,10 +482,27 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # annotate vMAGs
     rename_bins = False
-    annotations = annotate_fastas(contig_locs, output_dir, db_handler, logger, min_contig_size, prodigal_mode, trans_table,
-                                  bit_score_threshold, rbh_bit_score_threshold, custom_db_name, custom_fasta_loc,
-                                  custom_hmm_name, custom_hmm_loc, custom_hmm_cutoffs_loc, kofam_use_dbcan2_thresholds,
-                                  skip_trnascan, rename_bins, keep_tmp_dir, threads, verbose)
+    annotations = annotate_fastas(
+        fasta_locs=contig_locs,
+        output_dir=output_dir, 
+        db_handler=db_handler,
+        logger=logger,
+        min_contig_size=min_contig_size,
+        prodigal_mode=prodigal_mode,
+        trans_table=trans_table,
+        bit_score_threshold=bit_score_threshold,
+        rbh_bit_score_threshold=rbh_bit_score_threshold,
+        custom_db_name=custom_db_name,
+        custom_fasta_loc=custom_fasta_loc,
+        custom_hmm_name=custom_hmm_name,
+        custom_hmm_loc=custom_hmm_loc,
+        custom_hmm_cutoffs_loc=custom_hmm_cutoffs_loc,
+        kofam_use_dbcan2_thresholds=kofam_use_dbcan2_thresholds,
+        skip_trnascan=skip_trnascan,
+        rename_bins=rename_bins,
+        keep_tmp_dir=keep_tmp_dir,
+        threads=threads,
+        verbose=verbose)
     logging.info('Annotations complete, assigning auxiliary scores and flags')
 
     annotations = add_dramv_scores_and_flags(annotations, db_handler, logger, virsorter_hits, input_fasta)

From e2a8804f9364c6cc7af21ccbc8c2de988d09f4af Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Fri, 14 Oct 2022 13:42:18 -0600
Subject: [PATCH 03/10] Add breaks

---
 mag_annotator/annotate_bins.py | 2 ++
 mag_annotator/annotate_vgfs.py | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py
index c6e0059f..fce15166 100644
--- a/mag_annotator/annotate_bins.py
+++ b/mag_annotator/annotate_bins.py
@@ -891,6 +891,7 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size=
     tmp_dir = path.join(output_dir, 'working_dir')
     mkdir(tmp_dir)
 
+    breakpoint()
     # setup custom databases to be searched
     custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'),
                                         logger, threads, verbose)
@@ -906,6 +907,7 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size=
         logger.info('Annotating %s' % fasta_name)
         fasta_dir = path.join(tmp_dir, fasta_name)
         mkdir(fasta_dir)
+        breakpoint()
         annotations_list.append(
             annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode,
                            trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs,
diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py
index eaf0ffb3..e87c257c 100644
--- a/mag_annotator/annotate_vgfs.py
+++ b/mag_annotator/annotate_vgfs.py
@@ -434,6 +434,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
     log_file_path = path.join(output_dir, "Annotation.log")
     logger = logging.getLogger('annotation_log')
     setup_logger(logger, log_file_path)
+    breakpoint()
 
     # check inputs
     prodigal_modes = ['train', 'meta', 'single']
@@ -446,8 +447,9 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # get database locations
     db_handler = DatabaseHandler(logger)
+    breakpoint()
     db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE)
-
+    breakpoint()
     if virsorter_affi_contigs is not None:
         virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
     else:
@@ -482,6 +484,7 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # annotate vMAGs
     rename_bins = False
+    breakpoint()
     annotations = annotate_fastas(
         fasta_locs=contig_locs,
         output_dir=output_dir, 

From ca5dd0197a343af03ca6b0fe76ec1d31d6aa4500 Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Tue, 18 Oct 2022 14:49:27 -0600
Subject: [PATCH 04/10] Fix some typo, and name change errors in DRAM-v

---
 mag_annotator/annotate_bins.py  | 59 +++++++--------------------------
 mag_annotator/annotate_vgfs.py  | 16 ++++-----
 mag_annotator/summarize_vgfs.py |  5 +--
 3 files changed, 19 insertions(+), 61 deletions(-)

diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py
index fce15166..468f02bb 100644
--- a/mag_annotator/annotate_bins.py
+++ b/mag_annotator/annotate_bins.py
@@ -244,11 +244,16 @@ def kofam_hmmscan_formater(hits:pd.DataFrame, hmm_info_path:str=None, use_dbcan2
 
 
 
-def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, db_handler=None):
+def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, logger:logging.Logger,
+                           db_handler=None):
+    categories_col = f"{db_name}_categories"
+    id_col = f"{db_name}_id"
+    description_col = f"{db_name}_description"
     hits_sig = hits[hits.apply(get_sig_row, axis=1)]
     if len(hits_sig) == 0:
+        logger.warn("No significant hits for vog_db")
         # if nothing significant then return nothing, don't get descriptions
-        return pd.DataFrame()
+        return pd.DataFrame(columns=[categories_col, id_col, description_col])
     # Get the best hits
     hits_best = hits_sig.sort_values('full_evalue').drop_duplicates(subset=["query_id"])
     if db_handler is None:
@@ -257,16 +262,16 @@ def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, db_handler=None):
         # get_descriptions
         desc_col = f"{db_name}_hits"
         descriptions = pd.DataFrame(
-            db_handler.get_descriptions(hits_best['target_id'].unique(), f"{db_name}_description"),
+            db_handler.get_descriptions(hits_best['target_id'].unique(), description_col),
             index=[desc_col]).T
         categories = descriptions[desc_col].apply(lambda x: x.split('; ')[-1])
-        descriptions[f"{db_name}_categories"] = categories.apply(
+        descriptions[categories_col] = categories.apply(
             lambda x: ';'.join(set([x[i:i + 2] for i in range(0, len(x), 2)])))
         descriptions['target_id'] = descriptions.index
         hits_df = pd.merge(hits_best[['query_id', 'target_id']], descriptions, on=f'target_id')
     hits_df.set_index('query_id', inplace=True, drop=True)
     hits_df.rename_axis(None, inplace=True)
-    hits_df.rename(columns={'target_id': f"{db_name}_id"}, inplace=True)
+    hits_df.rename(columns={'target_id': id_col}, inplace=True)
     return hits_df
 
 
@@ -753,6 +758,7 @@ def annotate_orfs(gene_faa, db_handler, tmp_dir, logger, custom_db_locs=(), cust
                                            formater=partial(
                                                vogdb_hmmscan_formater,
                                                db_name='vogdb',
+                                               logger=logger,
                                                db_handler=db_handler
                                            ),
                                            logger=logger))
@@ -891,7 +897,6 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size=
     tmp_dir = path.join(output_dir, 'working_dir')
     mkdir(tmp_dir)
 
-    breakpoint()
     # setup custom databases to be searched
     custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'),
                                         logger, threads, verbose)
@@ -907,7 +912,6 @@ def annotate_fastas(fasta_locs, output_dir, db_handler, logger, min_contig_size=
         logger.info('Annotating %s' % fasta_name)
         fasta_dir = path.join(tmp_dir, fasta_name)
         mkdir(fasta_dir)
-        breakpoint()
         annotations_list.append(
             annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode,
                            trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs,
@@ -929,45 +933,6 @@ def make_fasta_namses_df(fasta_loc):
     return pd.DataFrame(names)
 
 
-def annotate_fastas_as_one(fasta_locs, output_dir, db_handler, logger:logging.Logger, min_contig_size=5000, prodigal_mode='meta',
-                    trans_table='11', bit_score_threshold=60, rbh_bit_score_threshold=350, custom_db_name=(),
-                    custom_fasta_loc=(), custom_hmm_name=(), custom_hmm_loc=(), custom_hmm_cutoffs_loc=(),
-                    kofam_use_dbcan2_thresholds=False, skip_trnascan=False, rename_bins=True, keep_tmp_dir=True,
-                    threads=10, verbose=True):
-    # check for no conflicting options/configurations
-    tmp_dir = path.join(output_dir, 'working_dir')
-    mkdir(tmp_dir)
-
-    # setup custom databases to be searched
-    custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'),
-                                        logger, threads, verbose)
-    custom_hmm_locs = process_custom_hmms(custom_hmm_loc, custom_hmm_name, logger)
-    custom_hmm_cutoffs_locs= process_custom_hmm_cutoffs(custom_hmm_cutoffs_loc, custom_hmm_name, logger)
-    logger.info('Retrieved database locations and descriptions')
-
-    # iterate over list of fastas and annotate each individually
-    annotations_list = list()
-    names = pd.concat([])
-    for fasta_loc in fasta_locs:
-        # get name of file e.g. /home/shaffemi/my_genome.fa -> my_genome
-        fasta_name = get_fasta_name(fasta_loc)
-        logger.info('Annotating %s' % fasta_name)
-        fasta_dir = path.join(tmp_dir, fasta_name)
-        mkdir(fasta_dir)
-        annotations_list.append(
-            annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode,
-                           trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs,
-                           bit_score_threshold, rbh_bit_score_threshold, kofam_use_dbcan2_thresholds,
-                           skip_trnascan, threads, rename_bins, keep_tmp_dir, verbose))
-    logger.info('Annotations complete, processing annotations')
-    all_annotations = merge_annotations(annotations_list, output_dir)
-
-    # clean up
-    if not keep_tmp_dir:
-        rmtree(tmp_dir)
-    return all_annotations
-
-
 # TODO: Add force flag to remove output dir if it already exists
 # TODO: Add continute flag to continue if output directory already exists
 # TODO: make fasta loc either a string or list to remove annotate_bins_cmd and annotate_called_genes_cmd?
@@ -1212,7 +1177,7 @@ def merge_annotations_cmd(input_dirs, output_dir):
     mkdir(output_dir)
     # Get a logger
     annotations_list = list()
-    log_file_path = path.join(output_dir, "annotation.log")
+    log_file_path = path.join(output_dir, "annotate.log")
     logger = logging.getLogger('annotate.log')
     setup_logger(logger, log_file_path)
     logger.info(f"The log file is created at {log_file_path}")
diff --git a/mag_annotator/annotate_vgfs.py b/mag_annotator/annotate_vgfs.py
index e87c257c..23e2b5db 100644
--- a/mag_annotator/annotate_vgfs.py
+++ b/mag_annotator/annotate_vgfs.py
@@ -16,7 +16,7 @@
 from mag_annotator.utils import setup_logger
 from mag_annotator.summarize_genomes import get_ids_from_annotations_all
 
-VMAG_DBS_TO_ANNOTATE = ('kegg', 'kofam', 'kofam_ko_list', 'uniref', 'peptidase', 'pfam', 'dbcan', 'viral', 'vogdb')
+VMAG_DBS_TO_ANNOTATE = ('kegg', 'kofam_hmm', 'kofam_ko_list', 'uniref', 'peptidase', 'pfam', 'dbcan', 'viral', 'vogdb')
 VIRSORTER_COLUMN_NAMES = ['gene_name', 'start_position', 'end_position', 'length', 'strandedness',
                           'viral_protein_cluster_hit', 'viral_protein_cluster_hit_score',
                           'viral_protein_cluster_hit_evalue', 'viral_protein_cluster_category', 'pfam_hit',
@@ -288,9 +288,6 @@ def get_metabolic_flags(annotations, metabolic_genes, amgs, verified_amgs, scaff
     flag_dict = dict()
     metabolic_genes = set(metabolic_genes)
     for scaffold, scaffold_annotations in annotations.groupby('scaffold'):
-        # perc_xh = sum([i == 'Xh' if not pd.isna(i) else False for i in scaffold_annotations['vogdb_categories']]) \
-        #           / scaffold_annotations.shape[0]
-        # is_j = perc_xh >= 0.18
         for gene, row in scaffold_annotations.iterrows():
             # set up
             flags = ''
@@ -431,10 +428,9 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
                   custom_hmm_name=(), use_uniref=False, kofam_use_dbcan2_thresholds=False, skip_trnascan=False,
                   keep_tmp_dir=True, low_mem_mode=False, threads=10, verbose=True):
     mkdir(output_dir)
-    log_file_path = path.join(output_dir, "Annotation.log")
+    log_file_path = path.join(output_dir, "annotate.log")
     logger = logging.getLogger('annotation_log')
     setup_logger(logger, log_file_path)
-    breakpoint()
 
     # check inputs
     prodigal_modes = ['train', 'meta', 'single']
@@ -447,9 +443,10 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # get database locations
     db_handler = DatabaseHandler(logger)
-    breakpoint()
-    db_handler.filter_db_locs(low_mem_mode=low_mem_mode, use_uniref=use_uniref, use_vogdb=True, master_list=VMAG_DBS_TO_ANNOTATE)
-    breakpoint()
+    db_handler.filter_db_locs(low_mem_mode=low_mem_mode,
+                              use_uniref=use_uniref,
+                              use_vogdb=True,
+                              master_list=VMAG_DBS_TO_ANNOTATE)
     if virsorter_affi_contigs is not None:
         virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
     else:
@@ -484,7 +481,6 @@ def annotate_vgfs(input_fasta, virsorter_affi_contigs=None, output_dir='.', min_
 
     # annotate vMAGs
     rename_bins = False
-    breakpoint()
     annotations = annotate_fastas(
         fasta_locs=contig_locs,
         output_dir=output_dir, 
diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py
index ff295b0a..2dc8e485 100644
--- a/mag_annotator/summarize_vgfs.py
+++ b/mag_annotator/summarize_vgfs.py
@@ -236,7 +236,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili
     # make output folder
     mkdir(output_dir)
     if log_file_path is None:
-        log_file_path = path.join(output_dir, "Distillation.log")
+        log_file_path = path.join(output_dir, "distill.log")
     logger = logging.getLogger('distillation_log')
     setup_logger(logger, log_file_path)
     logger.info(f"The log file is created at {log_file_path}")
@@ -258,10 +258,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili
     # get potential AMGs
     potential_amgs = filter_to_amgs(annotations, max_aux=max_auxiliary_score,
                                     remove_transposons=remove_transposons, remove_fs=remove_fs)
-    check_columns(annotations, logger)
     check_columns(potential_amgs, logger)
-    annotations.kegg_hit
-    annotations.iloc[0]
     logger.info('Determined potential amgs')
 
     # make distillate

From bc9fd2e88de896c32c3f4826e48799aa2350c1a4 Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Tue, 18 Oct 2022 14:59:47 -0600
Subject: [PATCH 05/10] Tweak some fixes from amg_summery fix

---
 mag_annotator/annotate_bins.py  | 55 +++++++--------------------------
 mag_annotator/summarize_vgfs.py |  3 --
 2 files changed, 11 insertions(+), 47 deletions(-)

diff --git a/mag_annotator/annotate_bins.py b/mag_annotator/annotate_bins.py
index 0c8dcb15..468f02bb 100644
--- a/mag_annotator/annotate_bins.py
+++ b/mag_annotator/annotate_bins.py
@@ -244,11 +244,16 @@ def kofam_hmmscan_formater(hits:pd.DataFrame, hmm_info_path:str=None, use_dbcan2
 
 
 
-def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, db_handler=None):
+def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, logger:logging.Logger,
+                           db_handler=None):
+    categories_col = f"{db_name}_categories"
+    id_col = f"{db_name}_id"
+    description_col = f"{db_name}_description"
     hits_sig = hits[hits.apply(get_sig_row, axis=1)]
     if len(hits_sig) == 0:
+        logger.warn("No significant hits for vog_db")
         # if nothing significant then return nothing, don't get descriptions
-        return pd.DataFrame()
+        return pd.DataFrame(columns=[categories_col, id_col, description_col])
     # Get the best hits
     hits_best = hits_sig.sort_values('full_evalue').drop_duplicates(subset=["query_id"])
     if db_handler is None:
@@ -257,16 +262,16 @@ def vogdb_hmmscan_formater(hits:pd.DataFrame,  db_name:str, db_handler=None):
         # get_descriptions
         desc_col = f"{db_name}_hits"
         descriptions = pd.DataFrame(
-            db_handler.get_descriptions(hits_best['target_id'].unique(), f"{db_name}_description"),
+            db_handler.get_descriptions(hits_best['target_id'].unique(), description_col),
             index=[desc_col]).T
         categories = descriptions[desc_col].apply(lambda x: x.split('; ')[-1])
-        descriptions[f"{db_name}_categories"] = categories.apply(
+        descriptions[categories_col] = categories.apply(
             lambda x: ';'.join(set([x[i:i + 2] for i in range(0, len(x), 2)])))
         descriptions['target_id'] = descriptions.index
         hits_df = pd.merge(hits_best[['query_id', 'target_id']], descriptions, on=f'target_id')
     hits_df.set_index('query_id', inplace=True, drop=True)
     hits_df.rename_axis(None, inplace=True)
-    hits_df.rename(columns={'target_id': f"{db_name}_id"}, inplace=True)
+    hits_df.rename(columns={'target_id': id_col}, inplace=True)
     return hits_df
 
 
@@ -753,6 +758,7 @@ def annotate_orfs(gene_faa, db_handler, tmp_dir, logger, custom_db_locs=(), cust
                                            formater=partial(
                                                vogdb_hmmscan_formater,
                                                db_name='vogdb',
+                                               logger=logger,
                                                db_handler=db_handler
                                            ),
                                            logger=logger))
@@ -927,45 +933,6 @@ def make_fasta_namses_df(fasta_loc):
     return pd.DataFrame(names)
 
 
-def annotate_fastas_as_one(fasta_locs, output_dir, db_handler, logger:logging.Logger, min_contig_size=5000, prodigal_mode='meta',
-                    trans_table='11', bit_score_threshold=60, rbh_bit_score_threshold=350, custom_db_name=(),
-                    custom_fasta_loc=(), custom_hmm_name=(), custom_hmm_loc=(), custom_hmm_cutoffs_loc=(),
-                    kofam_use_dbcan2_thresholds=False, skip_trnascan=False, rename_bins=True, keep_tmp_dir=True,
-                    threads=10, verbose=True):
-    # check for no conflicting options/configurations
-    tmp_dir = path.join(output_dir, 'working_dir')
-    mkdir(tmp_dir)
-
-    # setup custom databases to be searched
-    custom_db_locs = process_custom_dbs(custom_fasta_loc, custom_db_name, path.join(tmp_dir, 'custom_dbs'),
-                                        logger, threads, verbose)
-    custom_hmm_locs = process_custom_hmms(custom_hmm_loc, custom_hmm_name, logger)
-    custom_hmm_cutoffs_locs= process_custom_hmm_cutoffs(custom_hmm_cutoffs_loc, custom_hmm_name, logger)
-    logger.info('Retrieved database locations and descriptions')
-
-    # iterate over list of fastas and annotate each individually
-    annotations_list = list()
-    names = pd.concat([])
-    for fasta_loc in fasta_locs:
-        # get name of file e.g. /home/shaffemi/my_genome.fa -> my_genome
-        fasta_name = get_fasta_name(fasta_loc)
-        logger.info('Annotating %s' % fasta_name)
-        fasta_dir = path.join(tmp_dir, fasta_name)
-        mkdir(fasta_dir)
-        annotations_list.append(
-            annotate_fasta(fasta_loc, fasta_name, fasta_dir, db_handler, logger, min_contig_size, prodigal_mode,
-                           trans_table, custom_db_locs, custom_hmm_locs, custom_hmm_cutoffs_locs,
-                           bit_score_threshold, rbh_bit_score_threshold, kofam_use_dbcan2_thresholds,
-                           skip_trnascan, threads, rename_bins, keep_tmp_dir, verbose))
-    logger.info('Annotations complete, processing annotations')
-    all_annotations = merge_annotations(annotations_list, output_dir)
-
-    # clean up
-    if not keep_tmp_dir:
-        rmtree(tmp_dir)
-    return all_annotations
-
-
 # TODO: Add force flag to remove output dir if it already exists
 # TODO: Add continute flag to continue if output directory already exists
 # TODO: make fasta loc either a string or list to remove annotate_bins_cmd and annotate_called_genes_cmd?
diff --git a/mag_annotator/summarize_vgfs.py b/mag_annotator/summarize_vgfs.py
index c091689d..c5c8760b 100644
--- a/mag_annotator/summarize_vgfs.py
+++ b/mag_annotator/summarize_vgfs.py
@@ -243,10 +243,7 @@ def summarize_vgfs(input_file, output_dir, groupby_column='scaffold', max_auxili
     # get potential AMGs
     potential_amgs = filter_to_amgs(annotations, max_aux=max_auxiliary_score,
                                     remove_transposons=remove_transposons, remove_fs=remove_fs)
-    check_columns(annotations, logger)
     check_columns(potential_amgs, logger)
-    annotations.kegg_hit
-    annotations.iloc[0]
     logger.info('Determined potential amgs')
 
     # make distillate

From fab203a990e234eee8e798f932f3475ec07e8dbb Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Wed, 4 Jan 2023 14:32:44 -0700
Subject: [PATCH 06/10] AMG summary improved, some warnings prevented

This update changes the AMG summary so that it will provide all
information from both the Metabolic DB and The AMG DB. It is a bit of a
kitchen sink approach but we can always cut it down if needed and most
dram-v users I would call advanced.

There are also some small quality of life improvements.
---
 mag_annotator/summarize_genomes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mag_annotator/summarize_genomes.py b/mag_annotator/summarize_genomes.py
index 3c2d22a9..c58cb130 100644
--- a/mag_annotator/summarize_genomes.py
+++ b/mag_annotator/summarize_genomes.py
@@ -97,7 +97,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou
     for genome, frame in annotations.groupby(groupby_column, sort=False):
         # make dict of identifiers to gene names
         id_gene_dict = defaultdict(list)
-        for gene, ids in get_ids_from_annotations_by_row(frame).iteritems():
+        for gene, ids in get_ids_from_annotations_by_row(frame).items():
             for id_ in ids:
                 id_gene_dict[id_].append(gene)
         # fill in genome summary_frame
@@ -316,7 +316,7 @@ def get_module_step_coverage(kos, module_net):
 def make_module_coverage_df(annotation_df, module_nets):
     kos_to_genes = defaultdict(list)
     ko_id_name = 'kegg_id' if 'kegg_id' in annotation_df.columns else 'ko_id'
-    for gene_id, ko_list in annotation_df[ko_id_name].iteritems():
+    for gene_id, ko_list in annotation_df[ko_id_name].items():
         if type(ko_list) is str:
             for ko in ko_list.split(','):
                 kos_to_genes[ko].append(gene_id)

From 135c08b00f6176c377654f1527ec92573c09190e Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Wed, 4 Jan 2023 14:36:46 -0700
Subject: [PATCH 07/10] A version bump, a new release will be made

---
 mag_annotator/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mag_annotator/__init__.py b/mag_annotator/__init__.py
index 4e7c72a5..9e0feee7 100644
--- a/mag_annotator/__init__.py
+++ b/mag_annotator/__init__.py
@@ -1 +1 @@
-__version__ = '1.4.3'
+__version__ = '1.4.4'

From 7b253d79c512ff5fcce7b3c557b04bf2eb36f2b3 Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Wed, 4 Jan 2023 14:42:22 -0700
Subject: [PATCH 08/10] Output folders now required in almost all cases

---
 scripts/DRAM-v.py | 6 +++---
 scripts/DRAM.py   | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/DRAM-v.py b/scripts/DRAM-v.py
index 38abc0e9..93b110f3 100644
--- a/scripts/DRAM-v.py
+++ b/scripts/DRAM-v.py
@@ -27,7 +27,7 @@
     annotate_parser.add_argument('-i', '--input_fasta', help="fasta file, output from ", required=True)
     annotate_parser.add_argument('-v', '--virsorter_affi_contigs', help="VirSorter VIRSorter_affi-contigs.tab "
                                                                         "output file")
-    annotate_parser.add_argument('-o', '--output_dir', help="output directory")
+    annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True)
     annotate_parser.add_argument('--min_contig_size', type=int, default=2500,
                                  help='minimum contig size to be used for gene prediction')
     annotate_parser.add_argument('--split_contigs', action='store_true', default=False,
@@ -78,7 +78,7 @@
 
     # parser for summarizing genomes
     distill_parser.add_argument("-i", "--input_file", help="Annotations path")
-    distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes")
+    distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes", required=True)
     distill_parser.add_argument("--groupby_column", help="Column from annotations to group as VGF units",
                                 default='scaffold')
     distill_parser.add_argument("--max_auxiliary_score", type=int, default=3,
@@ -134,7 +134,7 @@
 
     # parser for getting gene neighborhoods
     neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path")
-    neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods")
+    neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods", required=True)
     neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around")
     neighborhood_parser.add_argument("--identifiers", nargs='*',
                                      help="Database identifiers assigned by DRAM to find neighborhoods around")
diff --git a/scripts/DRAM.py b/scripts/DRAM.py
index 2430a68d..50228449 100644
--- a/scripts/DRAM.py
+++ b/scripts/DRAM.py
@@ -32,7 +32,7 @@
     annotate_parser.add_argument('-i', '--input_fasta', action='append',
                                  help="fasta file, optionally with wildcards to point to multiple fastas",
                                  required=True)
-    annotate_parser.add_argument('-o', '--output_dir', help="output directory")
+    annotate_parser.add_argument('-o', '--output_dir', help="output directory", required=True)
     annotate_parser.add_argument('--min_contig_size', type=int, default=2500,
                                  help='minimum contig size to be used for gene prediction')
     annotate_parser.add_argument('--config_loc', default=None,
@@ -130,7 +130,7 @@
 
     # parser for summarizing genomes
     distill_parser.add_argument("-i", "--input_file", help="Annotations path")
-    distill_parser.add_argument("-o", "--output_dir", help="Directory to write summarized genomes")
+    distill_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write summarized genomes")
     distill_parser.add_argument('--log_file_path', 
                                        help="A name and loctation for the log file")
     distill_parser.add_argument("--rrna_path", help="rRNA output from annotation")
@@ -179,7 +179,7 @@
 
     # parser for getting gene neighborhoods
     neighborhood_parser.add_argument("-i", "--input_file", help="Annotations path")
-    neighborhood_parser.add_argument("-o", "--output_dir", help="Directory to write gene neighborhoods")
+    neighborhood_parser.add_argument("-o", "--output_dir", required=True, help="Directory to write gene neighborhoods")
     neighborhood_parser.add_argument("--genes", nargs='*', help="Gene names from DRAM to find neighborhoods around")
     neighborhood_parser.add_argument("--identifiers", nargs='*',
                                      help="Database identifiers assigned by DRAM to find neighborhoods around")
@@ -199,7 +199,7 @@
     # TODO: Make it work with append so you can use multiple -i's
     merge_annotations_parser.add_argument("-i", "--input_dirs", help="Path with wildcards pointing to DRAM annotation "
                                                                      "output directories")
-    merge_annotations_parser.add_argument("-o", "--output_dir", help="Path to output merged annotations files")
+    merge_annotations_parser.add_argument("-o", "--output_dir", required=True,  help="Path to output merged annotations files")
     merge_annotations_parser.set_defaults(func=merge_annotations_cmd)
 
     args = parser.parse_args()

From 7202d7154723755f349e8b0667b5d2895423d57e Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Wed, 4 Jan 2023 15:16:59 -0700
Subject: [PATCH 09/10] Fix one test and remove some commented code

---
 mag_annotator/database_processing.py | 48 ----------------------------
 tests/test_summarize_vgfs.py         |  6 ++--
 2 files changed, 4 insertions(+), 50 deletions(-)

diff --git a/mag_annotator/database_processing.py b/mag_annotator/database_processing.py
index 72dff9ea..cee0eefa 100644
--- a/mag_annotator/database_processing.py
+++ b/mag_annotator/database_processing.py
@@ -26,54 +26,6 @@
 DEFAULT_MMMSPRO_DB_NAME = 'db'
 
 
-# KOFAM_CITATION = ("Aramaki T., Blanc-Mathieu R., Endo H., Ohkubo K., Kanehisa "
-#                   "M., Goto S., Ogata H.\nKofamKOALA: KEGG ortholog assignment"
-#                   " based on profile HMM and adaptive score threshold.\nBioinf"
-#                   "ormatics. 2019 Nov 19. pii: btz859. doi: 10.1093/bioinforma"
-#                   "tics/btz859."
-#                   ) # arguably not for kofam but the closest I saw
-# VIRAL_REFSEQ_CITATION = ("Brister JR, Ako-Adjei D, Bao Y, Blinkova O. NCBI vir"
-#                          "al genomes resource. Nucleic Acids Res. 2015 Jan;43("
-#                          "Database issue):D571-7 PubMed PubMedCentral"
-#                          ) # Three options but this one is viral specific
-# KEGG_CITATION = ("Kanehisa, M., Furumichi, M., Sato, Y., Ishiguro-Watanabe, M."
-#                  ", and Tanabe, M.; KEGG: integrating viruses and cellular org"
-#                  "anisms. Nucleic Acids Res. 49, D545-D551 (2021)."
-#                  )
-# PFAM_CITATION = ("Pfam: The protein families database in 2021: J. Mistry, S. C"
-#                  "huguransky, L. Williams, M. Qureshi, G.A. Salazar, E.L.L. So"
-#                  "nnhammer, S.C.E. Tosatto, L. Paladin, S. Raj, L.J. Richardso"
-#                  "n, R.D. Finn, A. Bateman"
-#                  )
-# PEPTIDASE_CITATION = ("Rawlings, N.D., Barrett, A.J., Thomas, P.D., Huang, X.,"
-#                       " Bateman, A. & Finn, R.D. (2018) The MEROPS database of"
-#                       " proteolytic enzymes, their substrates and inhibitors i"
-#                       "n 2017 and a comparison with peptidases in the PANTHER "
-#                       "database. Nucleic Acids Res 46, D624-D632."
-#                       )
-# VOGDB_CITATION = ("Thannesberger, J., Hellinger, H. J., Klymiuk, I., Kastner, M"
-#                   ". T., Rieder, F. J., Schneider, M., ... & Steininger, C. (20"
-#                   "17). Viruses comprise an extensive pool of mobile genetic el"
-#                   "ements in eukaryote cell cultures and human clinical samples"
-#                   ". The FASEB Journal, 31(5), 1987-2000."
-#                   )
-# UNIREF_CITATION = ("Wang Y, Wang Q, Huang H, Huang W, Chen Y, McGarvey PB, Wu C"
-#                    "H, Arighi CN, UniProt Consortium. A crowdsourcing open plat"
-#                    "form for literature curation in UniProt Plos Biology. 19(12"
-#                    "):e3001464 (2021)"
-#                    )
-# DBCAN_CITATION = ("Yin Y*, Mao X*, Yang JC, Chen X, Mao F and Xu Y, dbCAN: a we"
-#                   "b resource for automated carbohydrate-active enzyme annotati"
-#                   "on, Nucleic Acids Res. 2012 Jul;40(Web Server issue):W445-51"
-#                   ) # again a citation for the tool not the db
-# DRAM_CITATION = ("M. Shaffer, M. A. Borton, B. B. McGivern, A. A. Zayed, S. L. "
-#                  "La Rosa, L. M. Solden, P. Liu, A. B. Narrowe, J. Rodríguez-Ra"
-#                  "mos, B. Bolduc et al., \"Dram for distilling microbial metabo"
-#                  "lism to automate the curation of microbiome function,\" Nucle"
-#                  "ic acids research, vol. 48, no. 16, pp. 8883–8900, 2020."
-#                  )
-
-
 KOFAM_CITATION = ("T. Aramaki, R. Blanc-Mathieu, H. Endo, K. Ohkubo, M. Kanehisa"
                   ", S. Goto, and H. Ogata, \"Kofamkoala: Kegg ortholog assignme"
                   "nt based on profile hmm and adaptive score threshold,\" Bioin"
diff --git a/tests/test_summarize_vgfs.py b/tests/test_summarize_vgfs.py
index b9e5681f..9e890274 100644
--- a/tests/test_summarize_vgfs.py
+++ b/tests/test_summarize_vgfs.py
@@ -46,8 +46,10 @@ def genome_summary_frame():
                                      columns=['gene_description', 'module', 'sheet', 'header', 'subheader']))
 
 
-def test_make_viral_distillate(pamgs, genome_summary_frame):
-    test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame)
+def test_make_viral_distillate(pamgs, genome_summary_frame, logger):
+    amg_database = pd.DataFrame()
+    test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame, amg_database=amg_database, logger=logger)
+    breakpoint()
     viral_distillate = pd.DataFrame([['gene_3', 'scaffold_1', 'K00001', 'description', 'main', 'header1', 'subheader1',
                                       'module1', 3, 'MFTJ'],
                                      ['gene_3', 'scaffold_1', 'K00001', 'description3', 'other', 'header1',

From e59237687370c32833b839d66f6baf89d2d22b24 Mon Sep 17 00:00:00 2001
From: Rory M Flynn <rmflynn@colostate.com>
Date: Thu, 5 Jan 2023 15:23:15 -0700
Subject: [PATCH 10/10] add a test for the new amg summary

I could not fix the sql someone more familiar with sql is free to do so
I will make a branch to deal with this
---
 mag_annotator/database_handler.py |   5 +-
 mag_annotator/database_setup.py   |  20 ---
 tests/test_summarize_vgfs.py      | 201 ++++++++++++++++++++++++------
 3 files changed, 165 insertions(+), 61 deletions(-)

diff --git a/mag_annotator/database_handler.py b/mag_annotator/database_handler.py
index 78a8bcd7..a94bca9f 100644
--- a/mag_annotator/database_handler.py
+++ b/mag_annotator/database_handler.py
@@ -5,11 +5,10 @@
 import logging
 from shutil import copy2
 import warnings
-from sqlalchemy import create_engine
-from sqlalchemy.orm import sessionmaker
-
 from datetime import datetime
 from functools import partial
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
 
 import pandas as pd
 
diff --git a/mag_annotator/database_setup.py b/mag_annotator/database_setup.py
index c0c2216d..dbe686c6 100644
--- a/mag_annotator/database_setup.py
+++ b/mag_annotator/database_setup.py
@@ -82,25 +82,6 @@ def serialize(self):
             'dbcan_subfam_ec': self.ec,
         }
 
-
-# DBCAN_SUBFAM_EC_TABLE_NAME = 'dbcan_subfam_ec'
-
-
-# class DbcanSubfamEC(Base):
-#     __tablename__ = DBCAN_SUBFAM_EC_TABLE_NAME
-# 
-#     id = Column(String(30), primary_key=True, nullable=False, index=True)
-# 
-#     description = Column(String(1000))
-# 
-#     @property
-#     def serialize(self):
-#         return {
-#             'dbcan_id': self.id,
-#             'dbcan_subfam_ec': self.description,
-#         }
-
-
 VIRAL_DESCRIPTION_TABLE_NAME = 'viral_description'
 
 
@@ -164,7 +145,6 @@ def create_description_db(db_loc):
                             UNIREF_DESCRIPTION_TABLE_NAME: UniRefDescription,
                             PFAM_DESCRIPTION_TABLE_NAME: PfamDescription,
                             DBCAN_DESCRIPTION_TABLE_NAME: DbcanDescription,
-                            # DBCAN_SUBFAM_EC_TABLE_NAME: DbcanSubfamEC,
                             VIRAL_DESCRIPTION_TABLE_NAME: ViralDescription,
                             PEPTIDASE_DESCRIPTION_TABLE_NAME: PeptidaseDescription,
                             VOGDB_DESCRIPTION_TABLE_NAME: VOGDBDescription}
diff --git a/tests/test_summarize_vgfs.py b/tests/test_summarize_vgfs.py
index 9e890274..01639a33 100644
--- a/tests/test_summarize_vgfs.py
+++ b/tests/test_summarize_vgfs.py
@@ -1,28 +1,49 @@
 import pytest
+import logging
+from mag_annotator.utils import setup_logger
 
 import pandas as pd
 import altair as alt
 
-from mag_annotator.summarize_vgfs import filter_to_amgs, get_strand_switches, make_viral_distillate, make_vgf_order, \
-    make_amg_count_column, make_viral_functional_df, make_viral_functional_heatmap
+from mag_annotator.summarize_vgfs import (
+    filter_to_amgs,
+    get_strand_switches,
+    make_viral_distillate,
+    make_vgf_order,
+    make_amg_count_column,
+    make_viral_functional_df,
+    make_viral_functional_heatmap,
+)
 
 
 @pytest.fixture()
 def annotations():
-    return pd.DataFrame([['scaffold_1', 'K99999', 'VTF', 1],
-                         ['scaffold_1', 'K12345', 'M', 4],
-                         ['scaffold_1', 'K00001', 'MFTJ', 3],
-                         ['scaffold_1', 'K11111', 'MKE', 2]],
-                        index=['gene_1', 'gene_2', 'gene_3', 'gene_4'],
-                        columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score'])
+    return pd.DataFrame(
+        [
+            ["scaffold_1", "K99999", "VTF", 1],
+            ["scaffold_1", "K12345", "M", 4],
+            ["scaffold_1", "K00001", "MFTJ", 3],
+            ["scaffold_1", "K11111", "MKE", 2],
+        ],
+        index=["gene_1", "gene_2", "gene_3", "gene_4"],
+        columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"],
+    )
 
 
 @pytest.fixture()
 def pamgs():
-    return pd.DataFrame([['scaffold_1', 'K00001', 'MFTJ', 3],
-                         ['scaffold_1', 'K11111', 'MKE', 2]],
-                        index=['gene_3', 'gene_4'],
-                        columns=['scaffold', 'ko_id', 'amg_flags', 'auxiliary_score'])
+    return pd.DataFrame(
+        [["scaffold_1", "K00001", "MFTJ", 3], ["scaffold_1", "K11111", "MKE", 2]],
+        index=["gene_3", "gene_4"],
+        columns=["scaffold", "ko_id", "amg_flags", "auxiliary_score"],
+    )
+
+
+@pytest.fixture()
+def logger(tmpdir):
+    logger = logging.getLogger("test_log")
+    setup_logger(logger)
+    return logger
 
 
 def test_filter_to_amgs(annotations, pamgs):
@@ -31,37 +52,129 @@ def test_filter_to_amgs(annotations, pamgs):
 
 
 def test_get_strand_switches():
-    assert get_strand_switches(['+', '-', '+']) == 2
-    assert get_strand_switches(['+', '+', '+']) == 0
-    assert get_strand_switches(['-']) == 0
-    assert get_strand_switches(['+', '+', '-', '-']) == 1
+    assert get_strand_switches(["+", "-", "+"]) == 2
+    assert get_strand_switches(["+", "+", "+"]) == 0
+    assert get_strand_switches(["-"]) == 0
+    assert get_strand_switches(["+", "+", "-", "-"]) == 1
 
 
 @pytest.fixture()
 def genome_summary_frame():
-    return pd.DataFrame(pd.DataFrame([['description', 'module1', 'main', 'header1', 'subheader1'],
-                                      ['description2', 'module2', 'main', 'header1', 'subheader1'],
-                                      ['description3', 'module3', 'other', 'header1', 'subheader1']],
-                                     index=['K00001', 'K12345', 'K00001'],
-                                     columns=['gene_description', 'module', 'sheet', 'header', 'subheader']))
+    return pd.DataFrame(
+        pd.DataFrame(
+            [
+                ["description", "module1", "main", "header1", "subheader1"],
+                ["description2", "module2", "main", "header1", "subheader1"],
+                ["description3", "module3", "other", "header1", "subheader1"],
+            ],
+            index=["K00001", "K12345", "K00001"],
+            columns=["gene_description", "module", "sheet", "header", "subheader"],
+        )
+    )
 
 
 def test_make_viral_distillate(pamgs, genome_summary_frame, logger):
-    amg_database = pd.DataFrame()
-    test_viral_distillate = make_viral_distillate(pamgs, genome_summary_frame, amg_database=amg_database, logger=logger)
-    breakpoint()
-    viral_distillate = pd.DataFrame([['gene_3', 'scaffold_1', 'K00001', 'description', 'main', 'header1', 'subheader1',
-                                      'module1', 3, 'MFTJ'],
-                                     ['gene_3', 'scaffold_1', 'K00001', 'description3', 'other', 'header1',
-                                      'subheader1', 'module3', 3, 'MFTJ'],
-                                     ['gene_4', 'scaffold_1', '', '', '', '', '', '', 2, 'MKE']],
-                                    columns=['gene', 'scaffold', 'gene_id', 'gene_description', 'category', 'header',
-                                             'subheader', 'module', 'auxiliary_score', 'amg_flags'])
-    pd.testing.assert_frame_equal(test_viral_distillate, viral_distillate)
+    amg_database = pd.DataFrame(
+        [
+            [
+                "K11111",
+                "gene_4",
+                "gene_4",
+                "d4",
+                "module3",
+                "metabolism3",
+                "reference2",
+                "true",
+            ]
+        ],
+        columns=[
+            "KO",
+            "EC",
+            "PFAM",
+            "gene",
+            "module",
+            "metabolism",
+            "reference",
+            "verified",
+        ],
+    )
+    output_viral_distillate = make_viral_distillate(
+        pamgs, genome_summary_frame, amg_database=amg_database, logger=logger
+    )
+    expected_viral_distillate = pd.DataFrame(
+        [
+            [
+                "gene_3",
+                "scaffold_1",
+                "K00001",
+                "description",
+                "header1",
+                "subheader1",
+                "module1",
+                3,
+                "MFTJ",
+                "main",
+                "genome_summary_form",
+                "",
+                "",
+            ],
+            [
+                "gene_3",
+                "scaffold_1",
+                "K00001",
+                "description3",
+                "header1",
+                "subheader1",
+                "module3",
+                3,
+                "MFTJ",
+                "other",
+                "genome_summary_form",
+                "",
+                "",
+                "",
+            ],
+            [
+                "gene_4",
+                "scaffold_1",
+                "K11111",
+                "d4",
+                "",
+                "",
+                "module3",
+                2,
+                "MKE",
+                "",
+                "amg_database",
+                "metabolism3",
+                "reference2",
+                "true",
+            ],
+        ],
+        columns=[
+            "gene",
+            "scaffold",
+            "gene_id",
+            "gene_description",
+            "header",
+            "subheader",
+            "module",
+            "auxiliary_score",
+            "amg_flags",
+            "sheet",
+            "gene_id_origin",
+            "metabolism",
+            "reference",
+            "verified",
+        ],
+    )
+    output_viral_distillate = output_viral_distillate.sort_index(axis=1).fillna("")
+    expected_viral_distillate = expected_viral_distillate.sort_index(axis=1).fillna("")
+    pd.testing.assert_frame_equal(output_viral_distillate, expected_viral_distillate)
 
 
 def test_make_vgf_order(pamgs):
-    assert make_vgf_order(pamgs) == ['scaffold_1']
+    assert make_vgf_order(pamgs) == ["scaffold_1"]
 
 
 def test_make_amg_count_column(pamgs):
@@ -70,14 +183,26 @@ def test_make_amg_count_column(pamgs):
 
 @pytest.fixture()
 def viral_functional_df():
-    return pd.DataFrame([['main', 'module1', 'gene_3', 'K00001', 'scaffold_1', True],
-                         ['other', 'module3', 'gene_3', 'K00001', 'scaffold_1', True]],
-                        columns=['Category', 'Function', 'AMG Genes', 'Genes Present', 'Contig Name',
-                                 'Present in Contig'])
+    return pd.DataFrame(
+        [
+            ["main", "module1", "gene_3", "K00001", "scaffold_1", True],
+            ["other", "module3", "gene_3", "K00001", "scaffold_1", True],
+        ],
+        columns=[
+            "Category",
+            "Function",
+            "AMG Genes",
+            "Genes Present",
+            "Contig Name",
+            "Present in Contig",
+        ],
+    )
 
 
 def test_make_viral_functional_df(pamgs, genome_summary_frame, viral_functional_df):
-    test_viral_function_df = make_viral_functional_df(pamgs, genome_summary_frame, 'scaffold')
+    test_viral_function_df = make_viral_functional_df(
+        pamgs, genome_summary_frame, "scaffold"
+    )
     pd.testing.assert_frame_equal(test_viral_function_df, viral_functional_df)