From 5d2da4d41d4ca1bb3b55e2a2761c8d7ac64f7f35 Mon Sep 17 00:00:00 2001
From: Kirill Bessonov <kbessonov@gmail.com>
Date: Thu, 1 Aug 2024 11:36:39 -0400
Subject: [PATCH] Fixed bugs related to batch mode output files displaying only
 last sample results instead of results from All samples. Also added automatic
 cleaner for the main output files to prevent excessive file output growth
 especially of Blastn results

---
 ectyper/definitions.py           |  4 +++-
 ectyper/ectyper.py               | 30 +++++++++++++++++++++---------
 ectyper/predictionFunctions.py   | 12 +++++++++---
 ectyper/speciesIdentification.py |  2 +-
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/ectyper/definitions.py b/ectyper/definitions.py
index 747dd1e..234c1d5 100644
--- a/ectyper/definitions.py
+++ b/ectyper/definitions.py
@@ -46,4 +46,6 @@
              'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', 
              'PathotypeIdentities(%)','PathotypeCoverages(%)','PathotypeGeneLengthRatios','PathotypeRuleIDs', 'PathotypeGeneCounts', 'PathoDBVer',
              'StxSubtypes','StxAccessions','StxAlleleIDs', 'StxIdentities(%)','StxCoverages(%)','StxLengths',
-             'StxContigNames', 'StxContigNum','StxCoordinates']
\ No newline at end of file
+             'StxContigNames', 'StxContigNum','StxCoordinates']
+OUTPUT_FILES_LIST = ['blastn_output_alleles.txt', 'blastn_pathotype_alleles_overall.txt', 'mash_output.txt', 
+                     'stx1_allhits_annotated_df.txt', 'stx2_allhits_annotated_df.txt']
\ No newline at end of file
diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
index b4b3b66..b4bee16 100644
--- a/ectyper/ectyper.py
+++ b/ectyper/ectyper.py
@@ -62,7 +62,7 @@ def run_program():
     Creates all required files and controls function execution.
     :return: success or failure
     """
-    
+    LOG.setLevel(logging.INFO)
     args = commandLineOptions.parse_command_line()
     
     
@@ -76,7 +76,7 @@ def run_program():
         LOG.setLevel(logging.DEBUG)
     else:
         fh.setLevel(logging.INFO)
-        LOG.setLevel(logging.INFO)
+        
     LOG.addHandler(fh)
 
     #try to load database
@@ -235,7 +235,7 @@ def run_program():
     
     if args.debug == False:
         shutil.rmtree(temp_dir, ignore_errors=True)
-    LOG.info("ECTyper has finished successfully.")
+    LOG.info(f"ECTyper has finished successfully. Results available at {os.path.abspath(args.output)}")
 
 def getOantigenHighSimilarGroup(final_predictions, sample):
     pred_Otypes = final_predictions[sample]['O']["serogroup"].split("/") #if call is a mixed call
@@ -263,8 +263,8 @@ def create_output_directory(output_dir):
     :param output_dir: The user-specified output directory, if any
     :return: The output directory
     """
-    # If no output directory is specified for the run, create a one based on
-    # time
+    # If no output directory is specified for the run, create a one based on time
+    
 
 
     if output_dir is None:
@@ -283,6 +283,13 @@ def create_output_directory(output_dir):
 
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
+
+    # clean previous ECTyper output files if the directory was used in previous runs 
+    for file in definitions.OUTPUT_FILES_LIST:
+        path2file = os.path.join(output_dir,file)
+        if os.path.exists(path2file):
+            LOG.info(f"Cleaning ECTyper previous files. Removing previously generated {path2file} ...")
+            os.remove(path2file) 
     return out_dir
 
 
@@ -398,9 +405,14 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
             blast_output_file,
             ectyperdb_dict,
             args);
-
-    blast_output_file_path = os.path.join(args.output,"blast_output_alleles.txt")
-    blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
-    LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))
+    
+    blast_output_file_path = os.path.join(args.output,f"blastn_output_alleles.txt")
+    if os.path.exists(blast_output_file_path) == False:
+        blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
+        LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))
+    else:
+        blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , mode="a", header=False, sep="\t", index=False)
+        LOG.info("Appending BLAST output file against reference alleles at {}".format(blast_output_file_path))
+    
 
     return db_prediction_dict
diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py
index 272bc5b..216a4eb 100644
--- a/ectyper/predictionFunctions.py
+++ b/ectyper/predictionFunctions.py
@@ -134,7 +134,11 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug):
             if debug:
                 stx_df_out_filename = f'{gene}_allhits_annotated_df.txt'
                 LOG.debug(f"Wrote {gene} annotated potential hits dataframe to {output_dir}/{stx_df_out_filename}")
-                stx_toxin_df.to_csv(os.path.join(output_dir,stx_df_out_filename), sep="\t", index=False)
+                path2stx_df = os.path.join(output_dir,stx_df_out_filename)
+                if os.path.exists(path2stx_df) == False:
+                    stx_toxin_df.to_csv(path2stx_df, sep="\t", index=False)
+                else:
+                    stx_toxin_df.to_csv(path2stx_df , mode="a", header=False, sep="\t", index=False)     
             # get top hit for each common gene range. Provide mixed call if >1 hits share the same 'bitscore'
             stx_subtypes_dict={}
             for range_id in stx_toxin_df['rangeid'].unique():
@@ -312,7 +316,9 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
     #write pathotype blastn results
     if debug == True:
         LOG.debug(f"Writting overall pathotype BLASTn results to {output_dir}/blastn_pathotype_alleles_overall.txt")
-        pathotype_genes_overall_df.to_csv(f'{output_dir}/blastn_pathotype_alleles_overall.txt',sep="\t", index=False)
+        path2pathotype_df = f'{output_dir}/blastn_pathotype_alleles_overall.txt'
+        pathotype_genes_overall_df.to_csv(path2pathotype_df,sep="\t", index=False)
+        
     
 
     return predictions_pathotype_dict
@@ -361,7 +367,7 @@ def predict_serotype(blast_output_file, ectyper_dict, args):
     # Make prediction for each genome based on blast output
     for genome_name, per_genome_df in output_df.groupby('genome_name'):
         predictions_dict[genome_name] = get_prediction(per_genome_df)
-    LOG.info("Serotype prediction successfully completed")
+    LOG.info(f"Serotype prediction successfully completed for {genome_name}")
     LOG.debug("Predictions dict:\n{}".format(predictions_dict))
 
     return predictions_dict, output_df
diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py
index 0b74294..4b528e2 100644
--- a/ectyper/speciesIdentification.py
+++ b/ectyper/speciesIdentification.py
@@ -185,7 +185,7 @@ def get_species(file, args, cores=1):
 
     if args.debug:
         LOG.debug("Wrote MASH against reference sketch results to {}".format(args.output))
-        with open(file=args.output+"/mash_output.txt", mode="w") as fp:
+        with open(file=args.output+"/mash_output.txt", mode="a") as fp:
             fp.write(sort_output.stdout.decode("utf-8"))
         fp.close()