Fixed bugs related to batch mode output files displaying only the last sample's results instead of results from all samples. Also added an automatic cleaner for the main output files to prevent excessive output file growth, especially of BLASTn results.
phac-nml · Aug 1, 2024 · 5d2da4d · 5d2da4d
1 parent 8002091
commit 5d2da4d
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 14 deletions.
diff --git a/ectyper/definitions.py b/ectyper/definitions.py
@@ -46,4 +46,6 @@
              'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs', 
              'PathotypeIdentities(%)','PathotypeCoverages(%)','PathotypeGeneLengthRatios','PathotypeRuleIDs', 'PathotypeGeneCounts', 'PathoDBVer',
              'StxSubtypes','StxAccessions','StxAlleleIDs', 'StxIdentities(%)','StxCoverages(%)','StxLengths',
-             'StxContigNames', 'StxContigNum','StxCoordinates']
+             'StxContigNames', 'StxContigNum','StxCoordinates']
+OUTPUT_FILES_LIST = ['blastn_output_alleles.txt', 'blastn_pathotype_alleles_overall.txt', 'mash_output.txt', 
+                     'stx1_allhits_annotated_df.txt', 'stx2_allhits_annotated_df.txt']
diff --git a/ectyper/ectyper.py b/ectyper/ectyper.py
@@ -62,7 +62,7 @@ def run_program():
     Creates all required files and controls function execution.
     :return: success or failure
     """
-
+    LOG.setLevel(logging.INFO)
     args = commandLineOptions.parse_command_line()
 
 
@@ -76,7 +76,7 @@ def run_program():
         LOG.setLevel(logging.DEBUG)
     else:
         fh.setLevel(logging.INFO)
-        LOG.setLevel(logging.INFO)
+
     LOG.addHandler(fh)
 
     #try to load database
@@ -235,7 +235,7 @@ def run_program():
 
     if args.debug == False:
         shutil.rmtree(temp_dir, ignore_errors=True)
-    LOG.info("ECTyper has finished successfully.")
+    LOG.info(f"ECTyper has finished successfully. Results available at {os.path.abspath(args.output)}")
 
 def getOantigenHighSimilarGroup(final_predictions, sample):
     pred_Otypes = final_predictions[sample]['O']["serogroup"].split("/") #if call is a mixed call
@@ -263,8 +263,8 @@ def create_output_directory(output_dir):
     :param output_dir: The user-specified output directory, if any
     :return: The output directory
     """
-    # If no output directory is specified for the run, create a one based on
-    # time
+    # If no output directory is specified for the run, create a one based on time
+
 
 
     if output_dir is None:
@@ -283,6 +283,13 @@ def create_output_directory(output_dir):
 
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
+
+    # clean previous ECTyper output files if the directory was used in previous runs 
+    for file in definitions.OUTPUT_FILES_LIST:
+        path2file = os.path.join(output_dir,file)
+        if os.path.exists(path2file):
+            LOG.info(f"Cleaning ECTyper previous files. Removing previously generated {path2file} ...")
+            os.remove(path2file) 
     return out_dir
 
 
@@ -398,9 +405,14 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
             blast_output_file,
             ectyperdb_dict,
             args);
-
-    blast_output_file_path = os.path.join(args.output,"blast_output_alleles.txt")
-    blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
-    LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))
+
+    blast_output_file_path = os.path.join(args.output,f"blastn_output_alleles.txt")
+    if os.path.exists(blast_output_file_path) == False:
+        blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
+        LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))
+    else:
+        blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , mode="a", header=False, sep="\t", index=False)
+        LOG.info("Appending BLAST output file against reference alleles at {}".format(blast_output_file_path))
+
 
     return db_prediction_dict
diff --git a/ectyper/predictionFunctions.py b/ectyper/predictionFunctions.py
@@ -134,7 +134,11 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug):
             if debug:
                 stx_df_out_filename = f'{gene}_allhits_annotated_df.txt'
                 LOG.debug(f"Wrote {gene} annotated potential hits dataframe to {output_dir}/{stx_df_out_filename}")
-                stx_toxin_df.to_csv(os.path.join(output_dir,stx_df_out_filename), sep="\t", index=False)
+                path2stx_df = os.path.join(output_dir,stx_df_out_filename)
+                if os.path.exists(path2stx_df) == False:
+                    stx_toxin_df.to_csv(path2stx_df, sep="\t", index=False)
+                else:
+                    stx_toxin_df.to_csv(path2stx_df , mode="a", header=False, sep="\t", index=False)     
             # get top hit for each common gene range. Provide mixed call if >1 hits share the same 'bitscore'
             stx_subtypes_dict={}
             for range_id in stx_toxin_df['rangeid'].unique():
@@ -312,7 +316,9 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
     #write pathotype blastn results
     if debug == True:
         LOG.debug(f"Writting overall pathotype BLASTn results to {output_dir}/blastn_pathotype_alleles_overall.txt")
-        pathotype_genes_overall_df.to_csv(f'{output_dir}/blastn_pathotype_alleles_overall.txt',sep="\t", index=False)
+        path2pathotype_df = f'{output_dir}/blastn_pathotype_alleles_overall.txt'
+        pathotype_genes_overall_df.to_csv(path2pathotype_df,sep="\t", index=False)
+
 
 
     return predictions_pathotype_dict
@@ -361,7 +367,7 @@ def predict_serotype(blast_output_file, ectyper_dict, args):
     # Make prediction for each genome based on blast output
     for genome_name, per_genome_df in output_df.groupby('genome_name'):
         predictions_dict[genome_name] = get_prediction(per_genome_df)
-    LOG.info("Serotype prediction successfully completed")
+    LOG.info(f"Serotype prediction successfully completed for {genome_name}")
     LOG.debug("Predictions dict:\n{}".format(predictions_dict))
 
     return predictions_dict, output_df

diff --git a/ectyper/speciesIdentification.py b/ectyper/speciesIdentification.py
@@ -185,7 +185,7 @@ def get_species(file, args, cores=1):
 
     if args.debug:
         LOG.debug("Wrote MASH against reference sketch results to {}".format(args.output))
-        with open(file=args.output+"/mash_output.txt", mode="w") as fp:
+        with open(file=args.output+"/mash_output.txt", mode="a") as fp:
             fp.write(sort_output.stdout.decode("utf-8"))
         fp.close()