diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index 5849d2f08..5ce7225b4 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -1104,6 +1104,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | pangolin_notes | String | Lineage notes as determined by Pangolin | CL, FASTA, ONT, PE, SE | | pangolin_versions | String | All Pangolin software and database versions | CL, FASTA, ONT, PE, SE | | percent_reference_coverage | Float | Percent coverage of the reference genome after performing primer trimming; calculated as assembly_length_unambiguous / length of the reference genome (SC2: 29903) x 100 | CL, FASTA, ONT, PE, SE | +| percentage_mapped_reads | String |Percentage of reads that successfully aligned to the reference genome. This value is calculated by number of mapped reads / total number of reads x 100. | ONT, PE, SE | | primer_bed_name | String | Name of the primer bed files used for primer trimming | CL, ONT, PE, SE | | primer_trimmed_read_percent | Float | Percentage of read data with primers trimmed as determined by iVar trim | PE, SE | | qc_check | String | The results of the QC Check task | CL, FASTA, ONT, PE, SE | diff --git a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl index 9fe5f843b..359eb4ea1 100644 --- a/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl +++ b/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl @@ -14,11 +14,11 @@ task stats_n_coverage { samtools --version | head -n1 | tee VERSION samtools stats ~{bamfile} > ~{samplename}.stats.txt - samtools coverage ~{bamfile} -m -o ~{samplename}.cov.hist samtools coverage ~{bamfile} -o ~{samplename}.cov.txt samtools flagstat ~{bamfile} > ~{samplename}.flagstat.txt + # Extracting coverage, depth, meanbaseq, and meanmapq coverage=$(cut -f 6 ~{samplename}.cov.txt | tail -n 1) depth=$(cut -f 7 ~{samplename}.cov.txt | tail -n 1) meanbaseq=$(cut -f 8 ~{samplename}.cov.txt | tail -n 1) @@ -33,6 +33,34 @@ task stats_n_coverage { echo $depth | tee DEPTH echo $meanbaseq | tee MEANBASEQ echo $meanmapq | tee MEANMAPQ + + # Parsing stats.txt for total and mapped reads + total_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "raw total sequences:" | cut -f 3) + mapped_reads=$(grep "^SN" ~{samplename}.stats.txt | grep "reads mapped:" | cut -f 3) + + # Check for empty values and set defaults to avoid errors + if [ -z "$total_reads" ]; then total_reads="1"; fi # Avoid division by zero + if [ -z "$mapped_reads" ]; then mapped_reads="0"; fi + + # Calculate the percentage of mapped reads + percentage_mapped_reads=$(awk "BEGIN {printf \"%.2f\", ($mapped_reads / $total_reads) * 100}") + + # If the percentage calculation fails, default to 0.0 + if [ -z "$percentage_mapped_reads" ]; then percentage_mapped_reads="0.0"; fi + + # Output the result + echo $percentage_mapped_reads | tee PERCENTAGE_MAPPED_READS + + #output all metrics in one txt file + # Output header row (for CSV) + echo -e "Statistic\tValue" > ~{samplename}_metrics.txt + + # Output each statistic as a row + echo -e "Coverage\t$coverage" >> ~{samplename}_metrics.txt + echo -e "Depth\t$depth" >> ~{samplename}_metrics.txt + echo -e "Mean Base Quality\t$meanbaseq" >> ~{samplename}_metrics.txt + echo -e "Mean Mapping Quality\t$meanmapq" >> ~{samplename}_metrics.txt + echo -e "Percentage Mapped Reads\t$percentage_mapped_reads" >> ~{samplename}_metrics.txt >>> output { String date = read_string("DATE") @@ -45,6 +73,9 @@ task stats_n_coverage { Float depth = read_string("DEPTH") Float meanbaseq = read_string("MEANBASEQ") Float meanmapq = read_string("MEANMAPQ") + Float percentage_mapped_reads = read_string("PERCENTAGE_MAPPED_READS") + File metrics_txt = "~{samplename}_metrics.txt" + } runtime { docker: docker diff --git a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml index 48ffe30c9..e42083370 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_clearlabs.yml @@ -318,7 +318,7 @@ - path: miniwdl_run/call-pangolin4/work/clearlabs.pangolin_report.csv md5sum: 151390c419b00ca44eb83e2bbfb96996 - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 51da320ddc7de2ffeb263f0ddd85ced6 + md5sum: 218acd850fc53050a663b1cdc856bdbe - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -350,7 +350,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/clearlabs.stats.txt md5sum: bfed5344c91ce6f4db1f688cac0a3ab9 - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: a84f90b8877babe54bf8c068d244fbe8 + md5sum: eb5a87024061836b8c244b0cd050bb6c - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml index b1bb6da13..0d8d12351 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_pe.yml @@ -205,7 +205,7 @@ md5sum: 511e696afe25f8b96a84d68ec5a8af8a # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 260c3887be6d99b18caf6d3914c5737f + md5sum: cd4e1a2a33c10e9513e01c95eb641bdc - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -288,7 +288,7 @@ md5sum: 6c63395a125f8618334b8af2de4e2d88 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 1ffac4cc3e9bdd84a0f9228e8e5ca5d9 + md5sum: b4e5c57051c06349d24b01d00d383733 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml index 9af5b61c9..d79e610c7 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_illumina_se.yml @@ -157,7 +157,7 @@ md5sum: 603c3cbc771ca910b96d3c032aafe7c9 # stats n coverage primer trim - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/command - md5sum: 67cac223adcf059a9dfaa9f28ed34f68 + md5sum: e9aa93dda1866b935220de3a60a28455 - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage_primtrim/outputs.json @@ -240,7 +240,7 @@ md5sum: 03c5ecf22fdfdb6b240ac3880281a056 # stats n coverage - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/command - md5sum: 3dacccb252429a0ff46c079a75a09377 + md5sum: f1c13608d58271a8dcd476f442fc181c - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-ivar_consensus/call-stats_n_coverage/outputs.json diff --git a/tests/workflows/theiacov/test_wf_theiacov_ont.yml b/tests/workflows/theiacov/test_wf_theiacov_ont.yml index 1772e16b4..233cd6e4e 100644 --- a/tests/workflows/theiacov/test_wf_theiacov_ont.yml +++ b/tests/workflows/theiacov/test_wf_theiacov_ont.yml @@ -232,7 +232,7 @@ md5sum: 32c0be4fb7f3030bf9c74c0a836d4f2e - path: miniwdl_run/call-raw_check_reads/work/_miniwdl_inputs/0/ont.fastq.gz - path: miniwdl_run/call-stats_n_coverage/command - md5sum: 93414eacbbb9d7c4813bb54a8a507078 + md5sum: 22559ad4e4c2af9c55c563551e95e819 - path: miniwdl_run/call-stats_n_coverage/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage/outputs.json @@ -257,7 +257,7 @@ - path: miniwdl_run/call-stats_n_coverage/work/ont.flagstat.txt - path: miniwdl_run/call-stats_n_coverage/work/ont.stats.txt - path: miniwdl_run/call-stats_n_coverage_primtrim/command - md5sum: c6e7de70dfdbb1858229e44777b84110 + md5sum: ac19abff17f090e6da63cee8b831b212 - path: miniwdl_run/call-stats_n_coverage_primtrim/inputs.json contains: ["bamfile", "samplename"] - path: miniwdl_run/call-stats_n_coverage_primtrim/outputs.json diff --git a/workflows/theiacov/wf_theiacov_clearlabs.wdl b/workflows/theiacov/wf_theiacov_clearlabs.wdl index 8932c983f..0368bef95 100644 --- a/workflows/theiacov/wf_theiacov_clearlabs.wdl +++ b/workflows/theiacov/wf_theiacov_clearlabs.wdl @@ -207,6 +207,8 @@ workflow theiacov_clearlabs { Int number_Degenerate = consensus_qc.number_Degenerate Int number_Total = consensus_qc.number_Total Float percent_reference_coverage = consensus_qc.percent_reference_coverage + # Percentage mapped reads + Float percentage_mapped_reads = stats_n_coverage.percentage_mapped_reads # SC2 specific coverage outputs Float? sc2_s_gene_mean_coverage = gene_coverage.sc2_s_gene_depth Float? sc2_s_gene_percent_coverage = gene_coverage.sc2_s_gene_percent_coverage diff --git a/workflows/theiacov/wf_theiacov_illumina_pe.wdl b/workflows/theiacov/wf_theiacov_illumina_pe.wdl index ece3c201a..3aa1694ef 100644 --- a/workflows/theiacov/wf_theiacov_illumina_pe.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_pe.wdl @@ -144,7 +144,7 @@ workflow theiacov_illumina_pe { trim_primers = trim_primers } } - # perform flu-specific tasks + # for flu organisms call flu_track if (organism_parameters.standardized_organism == "flu") { call run_flu_track.flu_track { input: @@ -439,6 +439,7 @@ workflow theiacov_illumina_pe { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard - + # Capture percentage_mapped_reads from ivar_consensus task or flu_track task + String percentage_mapped_reads = select_first([ivar_consensus.percentage_mapped_reads, flu_track.percentage_mapped_reads,""]) } } diff --git a/workflows/theiacov/wf_theiacov_illumina_se.wdl b/workflows/theiacov/wf_theiacov_illumina_se.wdl index fa1044c24..4ae59f5dc 100644 --- a/workflows/theiacov/wf_theiacov_illumina_se.wdl +++ b/workflows/theiacov/wf_theiacov_illumina_se.wdl @@ -317,5 +317,7 @@ workflow theiacov_illumina_se { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Capture percentage_mapped_reads from ivar_consensus task + String? percentage_mapped_reads = ivar_consensus.percentage_mapped_reads } } \ No newline at end of file diff --git a/workflows/theiacov/wf_theiacov_ont.wdl b/workflows/theiacov/wf_theiacov_ont.wdl index 984934062..e1614d013 100644 --- a/workflows/theiacov/wf_theiacov_ont.wdl +++ b/workflows/theiacov/wf_theiacov_ont.wdl @@ -149,7 +149,7 @@ workflow theiacov_ont { standardized_organism = organism_parameters.standardized_organism, seq_method = seq_method } - } + } # nanoplot for basic QC metrics call nanoplot_task.nanoplot as nanoplot_raw { input: @@ -427,5 +427,7 @@ workflow theiacov_ont { # QC_Check Results String? qc_check = qc_check_task.qc_check File? qc_standard = qc_check_task.qc_standard + # Non-flu specific outputs + String percentage_mapped_reads = select_first([flu_track.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads,""]) } } \ No newline at end of file diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 71e8e952d..13c6bc36e 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -112,6 +112,14 @@ workflow flu_track { docker = assembly_metrics_docker } } + # Calculate the percentage of mapped reads for flu samples + if (defined(irma.seg_ha_bam) || defined(irma.seg_na_bam)) { + call assembly_metrics.stats_n_coverage as flu_stats_n_coverage { + input: + samplename = samplename, + bamfile = select_first([irma.seg_ha_bam, irma.seg_na_bam]) + } + } # combine HA & NA assembly coverages String ha_na_assembly_coverage_string = "HA: " + select_first([ha_assembly_coverage.depth, ""]) + ", NA: " + select_first([na_assembly_coverage.depth, ""]) # ABRICATE will run if assembly is provided, or was generated with IRMA @@ -249,13 +257,14 @@ workflow flu_track { File? irma_mp_segment_fasta = irma.seg_mp_assembly File? irma_np_segment_fasta = irma.seg_np_assembly File? irma_ns_segment_fasta = irma.seg_ns_assembly - Array[File] irma_assemblies = irma.irma_assemblies Array[File] irma_vcfs = irma.irma_vcfs Array[File] irma_bams = irma.irma_bams File? irma_ha_bam = irma.seg_ha_bam File? irma_na_bam = irma.seg_na_bam String ha_na_assembly_coverage = ha_na_assembly_coverage_string + # calulate mapped reads percentage for flu samples + Float? percentage_mapped_reads = flu_stats_n_coverage.percentage_mapped_reads # GenoFLU outputs String? genoflu_version = genoflu.genoflu_version String? genoflu_genotype = genoflu.genoflu_genotype diff --git a/workflows/utilities/wf_ivar_consensus.wdl b/workflows/utilities/wf_ivar_consensus.wdl index 8dadbd2b9..4e7112943 100644 --- a/workflows/utilities/wf_ivar_consensus.wdl +++ b/workflows/utilities/wf_ivar_consensus.wdl @@ -153,6 +153,8 @@ workflow ivar_consensus { String meanmapq_trim = select_first([stats_n_coverage_primtrim.meanmapq, stats_n_coverage.meanmapq,""]) String assembly_mean_coverage = select_first([stats_n_coverage_primtrim.depth, stats_n_coverage.depth,""]) String samtools_version_stats = stats_n_coverage.samtools_version - + + # Assembly metrics + String percentage_mapped_reads = select_first([stats_n_coverage_primtrim.percentage_mapped_reads, stats_n_coverage.percentage_mapped_reads,""]) } }