Skip to content

Commit

Permalink
Fixed bugs related to batch mode output files displaying only last sa…
Browse files Browse the repository at this point in the history
…mple results instead of results from all samples. Also added an automatic cleaner for the main output files to prevent excessive output file growth, especially of the Blastn results
  • Loading branch information
kbessonov1984 committed Aug 1, 2024
1 parent 8002091 commit 5d2da4d
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 14 deletions.
4 changes: 3 additions & 1 deletion ectyper/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,6 @@
'GeneLengths','DatabaseVer','Warnings','Pathotype', 'PathotypeCounts', 'PathotypeGenes', 'PathotypeGeneNames', 'PathotypeAccessions', 'PathotypeAlleleIDs',
'PathotypeIdentities(%)','PathotypeCoverages(%)','PathotypeGeneLengthRatios','PathotypeRuleIDs', 'PathotypeGeneCounts', 'PathoDBVer',
'StxSubtypes','StxAccessions','StxAlleleIDs', 'StxIdentities(%)','StxCoverages(%)','StxLengths',
'StxContigNames', 'StxContigNum','StxCoordinates']
'StxContigNames', 'StxContigNum','StxCoordinates']
OUTPUT_FILES_LIST = ['blastn_output_alleles.txt', 'blastn_pathotype_alleles_overall.txt', 'mash_output.txt',
'stx1_allhits_annotated_df.txt', 'stx2_allhits_annotated_df.txt']
30 changes: 21 additions & 9 deletions ectyper/ectyper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def run_program():
Creates all required files and controls function execution.
:return: success or failure
"""

LOG.setLevel(logging.INFO)
args = commandLineOptions.parse_command_line()


Expand All @@ -76,7 +76,7 @@ def run_program():
LOG.setLevel(logging.DEBUG)
else:
fh.setLevel(logging.INFO)
LOG.setLevel(logging.INFO)

LOG.addHandler(fh)

#try to load database
Expand Down Expand Up @@ -235,7 +235,7 @@ def run_program():

if args.debug == False:
shutil.rmtree(temp_dir, ignore_errors=True)
LOG.info("ECTyper has finished successfully.")
LOG.info(f"ECTyper has finished successfully. Results available at {os.path.abspath(args.output)}")

def getOantigenHighSimilarGroup(final_predictions, sample):
pred_Otypes = final_predictions[sample]['O']["serogroup"].split("/") #if call is a mixed call
Expand Down Expand Up @@ -263,8 +263,8 @@ def create_output_directory(output_dir):
:param output_dir: The user-specified output directory, if any
:return: The output directory
"""
# If no output directory is specified for the run, create a one based on
# time
# If no output directory is specified for the run, create a one based on time



if output_dir is None:
Expand All @@ -283,6 +283,13 @@ def create_output_directory(output_dir):

if not os.path.exists(out_dir):
os.makedirs(out_dir)

# clean previous ECTyper output files if the directory was used in previous runs
for file in definitions.OUTPUT_FILES_LIST:
path2file = os.path.join(output_dir,file)
if os.path.exists(path2file):
LOG.info(f"Cleaning ECTyper previous files. Removing previously generated {path2file} ...")
os.remove(path2file)
return out_dir


Expand Down Expand Up @@ -398,9 +405,14 @@ def genome_group_prediction(g_group, alleles_fasta, args, temp_dir, ectyperdb_di
blast_output_file,
ectyperdb_dict,
args);

blast_output_file_path = os.path.join(args.output,"blast_output_alleles.txt")
blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))

blast_output_file_path = os.path.join(args.output,f"blastn_output_alleles.txt")
if os.path.exists(blast_output_file_path) == False:
blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , sep="\t", index=False)
LOG.info("BLAST output file against reference alleles is written at {}".format(blast_output_file_path))
else:
blast_output_df[sorted(blast_output_df.columns)].to_csv(blast_output_file_path , mode="a", header=False, sep="\t", index=False)
LOG.info("Appending BLAST output file against reference alleles at {}".format(blast_output_file_path))


return db_prediction_dict
12 changes: 9 additions & 3 deletions ectyper/predictionFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,11 @@ def shiga_toxing_subtyping(pathotype_genes_tmp_df, output_dir, debug):
if debug:
stx_df_out_filename = f'{gene}_allhits_annotated_df.txt'
LOG.debug(f"Wrote {gene} annotated potential hits dataframe to {output_dir}/{stx_df_out_filename}")
stx_toxin_df.to_csv(os.path.join(output_dir,stx_df_out_filename), sep="\t", index=False)
path2stx_df = os.path.join(output_dir,stx_df_out_filename)
if os.path.exists(path2stx_df) == False:
stx_toxin_df.to_csv(path2stx_df, sep="\t", index=False)
else:
stx_toxin_df.to_csv(path2stx_df , mode="a", header=False, sep="\t", index=False)
# get top hit for each common gene range. Provide mixed call if >1 hits share the same 'bitscore'
stx_subtypes_dict={}
for range_id in stx_toxin_df['rangeid'].unique():
Expand Down Expand Up @@ -312,7 +316,9 @@ def predict_pathotype_and_shiga_toxin_subtype(ecoli_genome_files_dict, other_gen
#write pathotype blastn results
if debug == True:
LOG.debug(f"Writting overall pathotype BLASTn results to {output_dir}/blastn_pathotype_alleles_overall.txt")
pathotype_genes_overall_df.to_csv(f'{output_dir}/blastn_pathotype_alleles_overall.txt',sep="\t", index=False)
path2pathotype_df = f'{output_dir}/blastn_pathotype_alleles_overall.txt'
pathotype_genes_overall_df.to_csv(path2pathotype_df,sep="\t", index=False)



return predictions_pathotype_dict
Expand Down Expand Up @@ -361,7 +367,7 @@ def predict_serotype(blast_output_file, ectyper_dict, args):
# Make prediction for each genome based on blast output
for genome_name, per_genome_df in output_df.groupby('genome_name'):
predictions_dict[genome_name] = get_prediction(per_genome_df)
LOG.info("Serotype prediction successfully completed")
LOG.info(f"Serotype prediction successfully completed for {genome_name}")
LOG.debug("Predictions dict:\n{}".format(predictions_dict))

return predictions_dict, output_df
Expand Down
2 changes: 1 addition & 1 deletion ectyper/speciesIdentification.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def get_species(file, args, cores=1):

if args.debug:
LOG.debug("Wrote MASH against reference sketch results to {}".format(args.output))
with open(file=args.output+"/mash_output.txt", mode="w") as fp:
with open(file=args.output+"/mash_output.txt", mode="a") as fp:
fp.write(sort_output.stdout.decode("utf-8"))
fp.close()

Expand Down

0 comments on commit 5d2da4d

Please sign in to comment.