Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added 2 QC thresholds to ANI task to reduce false positives #168

Merged
merged 28 commits into from
Dec 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
3667868
added ani_threshold Float input to animummer task. also exposed cpus …
kapsakcj Aug 24, 2023
697c28b
fixed awk syntax
kapsakcj Aug 24, 2023
07a60a0
added ani_docker to export_taxon_tables task and illumina pe workflow
kapsakcj Aug 24, 2023
20cb4e3
remove "The"
kapsakcj Aug 24, 2023
d7de47e
added ani_docker output to theiaprok_fasta wf
kapsakcj Aug 25, 2023
ff17564
removed comma that I accidentally added to call block of qc_check_tab…
kapsakcj Aug 25, 2023
7e49db0
added ani_docker output to theiaprok illumina se and ONT workflows
kapsakcj Aug 25, 2023
5dc45e4
update CI
kapsakcj Aug 25, 2023
5fd1c38
Merge remote-tracking branch 'origin/main' into cjk-ani-threshold
kapsakcj Sep 1, 2023
3cc9766
update CI
kapsakcj Sep 1, 2023
a556675
mummer_ani task: added percent_base_aligned_threshold with 70 as defa…
kapsakcj Oct 6, 2023
aea0235
clarified messages when 2 thresholds are not passed
kapsakcj Oct 6, 2023
818065a
added ani_threshold Float input to animummer task. also exposed cpus …
kapsakcj Aug 24, 2023
3c49ebc
fixed awk syntax
kapsakcj Aug 24, 2023
dbbb92c
added ani_docker to export_taxon_tables task and illumina pe workflow
kapsakcj Aug 24, 2023
55dfc2f
remove "The"
kapsakcj Aug 24, 2023
c62449e
added ani_docker output to theiaprok_fasta wf
kapsakcj Aug 25, 2023
c9340fe
removed comma that I accidentally added to call block of qc_check_tab…
kapsakcj Aug 25, 2023
cb94465
added ani_docker output to theiaprok illumina se and ONT workflows
kapsakcj Aug 25, 2023
f08ee35
update CI
kapsakcj Aug 25, 2023
be3a734
mummer_ani task: added percent_base_aligned_threshold with 70 as defa…
kapsakcj Oct 6, 2023
3e23ba7
clarified messages when 2 thresholds are not passed
kapsakcj Oct 6, 2023
5fdd8f0
Merge branch 'cjk-ani-threshold' of https://github.com/theiagen/publi…
kapsakcj Oct 10, 2023
1c86b20
removed accidental duplication in code introduced during merge confli…
kapsakcj Oct 10, 2023
d25e956
update CI for theiaprok illumina pe and se
kapsakcj Oct 10, 2023
c66ba61
Merge remote-tracking branch 'origin/main' into cjk-ani-threshold
kapsakcj Oct 20, 2023
b500a8a
lower default ani_threshold to 80 (CDC standard) from 85
kapsakcj Oct 20, 2023
5a54052
update CI
kapsakcj Oct 20, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions tasks/quality_control/task_mummer_ani.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,17 @@ task animummer {
String samplename
File? ref_genome
Float mash_filter = 0.9
String docker="us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2"
# these 2 thresholds were set as they are used by CDC enterics lab/PulseNet for ANI thresholds
Float ani_threshold = 80.0
Float percent_bases_aligned_threshold = 70.0
String docker= "us-docker.pkg.dev/general-theiagen/staphb/mummer:4.0.0-rgdv2"
Int cpus = 4
Int memory = 8
Int disk_size = 100
}
command <<<
# capture and version
mummer --version | tee MUMMER_VERSION
mummer --version | tee MUMMER_VERSION.txt

# set the reference genome
# if not defined by user, then use all 43 genomes in RGDv2
Expand Down Expand Up @@ -42,18 +47,22 @@ task animummer {
echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed"
echo "The output TSV only contains the header line"
# set output variables as 0s or descriptive strings
echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED
echo "0.0" > ANI_HIGHEST_PERCENT
echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH
echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt
echo "0.0" > ANI_HIGHEST_PERCENT.txt
echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH.txt
# if output TSV has greater than 1 lines, then parse for appropriate outputs
else
## parse out highest percentBases aligned
cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED
echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)"
cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt
echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)"
ANI_HIGHEST_PERCENT_BASES_ALIGNED=$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)

## parse out ANI value using highest percentBases aligned value
grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT
echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)"
grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT.txt
echo "Highest ANI value is: $(cat ANI_HIGHEST_PERCENT.txt)"
# set ANI_HIGHEST_PERCENT as a bash variable (float)
ANI_HIGHEST_PERCENT=$(cat ANI_HIGHEST_PERCENT.txt)


# have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2
if [[ -z "~{ref_genome}" ]]; then
Expand All @@ -63,27 +72,46 @@ task animummer {
# cut on periods to pull out genus_species (in future this will inlcude lineages for Listeria and other sub-species designations)
# have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename
assembly_file_basename=$(basename ~{assembly})
grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH
echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)"
grep "${ANI_HIGHEST_PERCENT}" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH.txt
echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH.txt)"

# if ANI threshold or percent_bases_aligned_threshold is defined by user (they both are by default), compare to highest ANI value and corresponding percent_bases_aligned value and only output ANI_top_species_match if both thresholds are surpassed
if [[ -n "~{ani_threshold}" || -n "~{percent_bases_aligned_threshold}" ]]; then
echo "Comparing user-defined ANI threshold to highest ANI value..."
# compare ANI_HIGHEST_PERCENT to ani_threshold using awk
if ! awk "BEGIN{ exit ($ANI_HIGHEST_PERCENT < ~{ani_threshold} )}"; then
echo "The highest ANI value $ANI_HIGHEST_PERCENT is less than the user-defined ANI threshold of ~{ani_threshold}"
echo "ANI top species match did not surpass the user-defined ANI threshold of ~{ani_threshold}" > ANI_TOP_SPECIES_MATCH.txt
# else if: compare percent_bases_aligned_threshold to ANI_HIGHEST_PERCENT_BASES_ALIGNED using awk
elif ! awk "BEGIN{ exit (${ANI_HIGHEST_PERCENT_BASES_ALIGNED} < ~{percent_bases_aligned_threshold} )}"; then
echo "The highest ANI percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is less than the user-defined threshold of ~{percent_bases_aligned_threshold}"
# overwrite ANI_TOP_SPECIES_MATCH.txt when percent_bases_aligned threshold is not surpassed
echo "ANI top species match did not surpass the user-defined percent bases aligned threshold of ~{percent_bases_aligned_threshold}" > ANI_TOP_SPECIES_MATCH.txt
else
echo "The highest ANI value ${ANI_HIGHEST_PERCENT} is greater than the user-defined threshold ~{ani_threshold}"
echo "The highest percent bases aligned value ${ANI_HIGHEST_PERCENT_BASES_ALIGNED} is greater than the user-defined threshold ~{percent_bases_aligned_threshold}"
fi
fi
else
# User specified a reference genome, use fasta filename as output string
basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH
basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH.txt
echo "Reference genome used for ANI is: ${REF_GENOME}"
fi
fi

>>>
output {
Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT")
Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED")
Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT.txt")
Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED.txt")
File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv"
String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH")
String ani_mummer_version = read_string("MUMMER_VERSION")
String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH.txt")
String ani_mummer_version = read_string("MUMMER_VERSION.txt")
String ani_docker = "~{docker}"
}
runtime {
docker: "~{docker}"
memory: "8 GB"
cpu: 4
memory: "~{memory} GB"
cpu: cpus
disks: "local-disk " + disk_size + " SSD"
disk: disk_size + " GB"
maxRetries: 3
Expand Down
2 changes: 2 additions & 0 deletions tasks/utilities/task_broad_terra_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ task export_taxon_tables {
File? ani_output_tsv
String? ani_top_species_match
String? ani_mummer_version
String? ani_docker
String? kmerfinder_docker
File? kmerfinder_results_tsv
String? kmerfinder_top_hit
Expand Down Expand Up @@ -595,6 +596,7 @@ task export_taxon_tables {
"ani_output_tsv": "~{ani_output_tsv}",
"ani_top_species_match": "~{ani_top_species_match}",
"ani_mummer_version": "~{ani_mummer_version}",
"ani_docker": "~{ani_docker}",
"kmerfinder_docker": "~{kmerfinder_docker}",
"kmerfinder_results_tsv": "~{kmerfinder_results_tsv}",
"kmerfinder_top_hit": "~{kmerfinder_top_hit}",
Expand Down
8 changes: 4 additions & 4 deletions tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@
- path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl
contains: ["version", "fastq_scan", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl
md5sum: 21e8b1cbe4c8be26d1ac8b8013970166
md5sum: d50a40d533d51834d0000971dc2c2014
- path: miniwdl_run/wdl/tasks/quality_control/task_ncbi_scrub.wdl
contains: ["version", "scrub", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl
Expand Down Expand Up @@ -624,17 +624,17 @@
- path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl
md5sum: d49ae0b02e798af0636eb2721bb434b4
- path: miniwdl_run/wdl/tasks/task_versioning.wdl
md5sum: 9c02b95d23a602d9842fdeac883037b1
md5sum: 3469cf6787f279ffa81fa50f6257fcd7
- path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl
md5sum: 08987d952c67c6ff6debf6898af15f9a
- path: miniwdl_run/wdl/tasks/taxon_id/task_kraken2.wdl
md5sum: a1f287e6e6feaf2d7d3c74a70e3b5a28
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: 024971d1439dff7d59c0a26a824bd2c6
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
md5sum: 43ef050bde1fb8755f38e697a1794918
md5sum: 26ac9a20c8043a28d373bfe0ca361cc6
- path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_pe.wdl
md5sum: 0c87a7279c4870a821c3dc1db9a6a94b
md5sum: ac4971ad992c3e8abee5d3817928a8f2
- path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl
md5sum: 00bd2489b2a7aa5b88340a940961a857
- path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_pe.wdl
Expand Down
8 changes: 4 additions & 4 deletions tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,7 @@
- path: miniwdl_run/wdl/tasks/quality_control/task_fastq_scan.wdl
contains: ["version", "fastq_scan", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_mummer_ani.wdl
md5sum: 21e8b1cbe4c8be26d1ac8b8013970166
md5sum: d50a40d533d51834d0000971dc2c2014
- path: miniwdl_run/wdl/tasks/quality_control/task_qc_check_phb.wdl
contains: ["version", "qc_check_table", "output"]
- path: miniwdl_run/wdl/tasks/quality_control/task_quast.wdl
Expand Down Expand Up @@ -592,17 +592,17 @@
- path: miniwdl_run/wdl/tasks/species_typing/task_ts_mlst.wdl
md5sum: d49ae0b02e798af0636eb2721bb434b4
- path: miniwdl_run/wdl/tasks/task_versioning.wdl
md5sum: 9c02b95d23a602d9842fdeac883037b1
md5sum: 3469cf6787f279ffa81fa50f6257fcd7
- path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl
md5sum: 08987d952c67c6ff6debf6898af15f9a
- path: miniwdl_run/wdl/tasks/taxon_id/task_kraken2.wdl
md5sum: a1f287e6e6feaf2d7d3c74a70e3b5a28
- path: miniwdl_run/wdl/tasks/taxon_id/task_midas.wdl
md5sum: 024971d1439dff7d59c0a26a824bd2c6
- path: miniwdl_run/wdl/tasks/utilities/task_broad_terra_tools.wdl
md5sum: 43ef050bde1fb8755f38e697a1794918
md5sum: 26ac9a20c8043a28d373bfe0ca361cc6
- path: miniwdl_run/wdl/workflows/theiaprok/wf_theiaprok_illumina_se.wdl
md5sum: e1d9e75dae5176ceeb95b88a5d3bbba7
md5sum: 7303247badeb82119ccef528f7367f89
- path: miniwdl_run/wdl/workflows/utilities/wf_merlin_magic.wdl
md5sum: 00bd2489b2a7aa5b88340a940961a857
- path: miniwdl_run/wdl/workflows/utilities/wf_read_QC_trim_se.wdl
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_fasta.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ workflow theiaprok_fasta {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -427,6 +428,7 @@ workflow theiaprok_fasta {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_illumina_pe.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ workflow theiaprok_illumina_pe {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -621,6 +622,7 @@ workflow theiaprok_illumina_pe {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_illumina_se.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ workflow theiaprok_illumina_se {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -573,6 +574,7 @@ workflow theiaprok_illumina_se {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down
2 changes: 2 additions & 0 deletions workflows/theiaprok/wf_theiaprok_ont.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ workflow theiaprok_ont {
ani_output_tsv = ani.ani_output_tsv,
ani_top_species_match = ani.ani_top_species_match,
ani_mummer_version = ani.ani_mummer_version,
ani_docker = ani.ani_docker,
kmerfinder_docker = kmerfinder.kmerfinder_docker,
kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv,
kmerfinder_top_hit = kmerfinder.kmerfinder_top_hit,
Expand Down Expand Up @@ -547,6 +548,7 @@ workflow theiaprok_ont {
File? ani_output_tsv = ani.ani_output_tsv
String? ani_top_species_match = ani.ani_top_species_match
String? ani_mummer_version = ani.ani_mummer_version
String? ani_mummer_docker = ani.ani_docker
# kmerfinder outputs
String? kmerfinder_docker = kmerfinder.kmerfinder_docker
File? kmerfinder_results_tsv = kmerfinder.kmerfinder_results_tsv
Expand Down