From 7b265b2578cdf306e61414d681bf60f806f9cca3 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 24 Sep 2024 19:19:52 +0000 Subject: [PATCH 01/48] make theiameta_panel --- tasks/taxon_id/task_krakentools.wdl | 58 +++++++++ .../wf_theiameta_panel_illumina_pe.wdl | 116 ++++++++++++++++++ workflows/utilities/wf_morgana_magic.wdl | 112 +++++++++++++++++ .../utilities/wf_organism_parameters.wdl | 43 +++++-- 4 files changed, 322 insertions(+), 7 deletions(-) create mode 100644 tasks/taxon_id/task_krakentools.wdl create mode 100644 workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl create mode 100644 workflows/utilities/wf_morgana_magic.wdl diff --git a/tasks/taxon_id/task_krakentools.wdl b/tasks/taxon_id/task_krakentools.wdl new file mode 100644 index 000000000..7cbdea235 --- /dev/null +++ b/tasks/taxon_id/task_krakentools.wdl @@ -0,0 +1,58 @@ +version 1.0 + +task extract_kraken_reads { + input { + File kraken2_output + File kraken2_report + File read1 + File read2 + Int taxon_id + + Int cpu = 1 + Int disk_size = 100 + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/krakentools:d4a2fbe" + Int memory = 4 + } + command <<< + gunzip -c ~{kraken2_output} > kraken2_output_unzipped.txt + + python3 /KrakenTools/extract_kraken_reads.py \ + -k kraken2_output_unzipped.txt \ + -s1 ~{read1} \ + -s2 ~{read2} \ + --taxid ~{taxon_id} \ + --report ~{kraken2_report} \ + --include-parents \ + --include-children \ + --fastq-output \ + --output ~{taxon_id}_1.fastq \ + --output2 ~{taxon_id}_2.fastq + + if [ -s ~{taxon_id}_1.fastq ]; then + echo "DEBUG: Taxon ~{taxon_id} reads extracted" + echo "true" > CONTINUE + else + echo "DEBUG: No reads were extracted for taxon ~{taxon_id}, removing empty files" + echo "false" > CONTINUE + fi + + gzip ~{taxon_id}_1.fastq + gzip ~{taxon_id}_2.fastq + + >>> + output { + File extracted_read1 = "~{taxon_id}_1.fastq.gz" + File extracted_read2 = "~{taxon_id}_2.fastq.gz" + String krakentools_docker = docker + Boolean success = 
read_boolean("CONTINUE") + } + runtime { + cpu: cpu + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " SSD" + docker: docker + memory: "~{memory} GB" + preemptible: 1 + maxRetries: 3 + } +} \ No newline at end of file diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl new file mode 100644 index 000000000..1965d5089 --- /dev/null +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -0,0 +1,116 @@ +version 1.0 + +import "../../tasks/alignment/task_minimap2.wdl" as minimap2_task +import "../../tasks/assembly/task_metaspades.wdl" as metaspades_task +import "../../tasks/quality_control/basic_statistics/task_fastq_scan.wdl" as fastq_scan +import "../../tasks/quality_control/basic_statistics/task_quast.wdl" as quast_task +import "../../tasks/quality_control/read_filtering/task_pilon.wdl" as pilon_task +import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task +import "../../tasks/taxon_id/task_krakentools.wdl" as krakentools_task +import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task +import "../../tasks/utilities/data_handling/task_parse_mapping.wdl" as parse_mapping_task +import "../utilities/wf_morgana_magic.wdl" as morgana_magic_workflow +import "../utilities/wf_read_QC_trim_pe.wdl" as read_qc_trim_pe + +workflow theiameta_panel_illumina_pe { + input { + String samplename + File read1 + File read2 + Array[Int]? taxon_ids # suggest using a workspace element if user wants to modify? 
+ File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" + } + call kraken_task.kraken2_standalone as kraken2_raw { + input: + samplename = samplename, + read1 = read1, + read2 = read2, + kraken2_db = kraken2_db + } + call krona_task.krona as krona_raw { + input: + kraken2_report = kraken2_raw.kraken2_report, + samplename = samplename + } + call read_qc_trim_pe.read_QC_trim_pe as read_QC_trim { + input: + samplename = samplename, + read1 = read1, + read2 = read2, + workflow_series = "theiameta" + } + call kraken_task.kraken2_standalone as kraken2_clean { + input: + samplename = samplename, + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + kraken2_db = kraken2_db + } + call krona_task.krona as krona_clean { + input: + kraken2_report = kraken2_clean.kraken2_report, + samplename = samplename + } + scatter (taxon_id in taxon_ids) { + call krakentools_task.extract_kraken_reads as krakentools { + input: + kraken2_output = kraken2_clean.kraken2_classified_report, + kraken2_report = kraken2_clean.kraken2_report, + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + taxon_id = taxon_id + } + if (krakentools.success) { + call fastq_scan.fastq_scan_pe as fastq_scan_binned { + input: + read1 = krakentools.extracted_read1, + read2 = krakentools.extracted_read2 + } + #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING + if (fastq_scan_binned.read1_seq > 100) { + call metaspades_task.metaspades_pe { + input: + read1_cleaned = krakentools.extracted_read1, + read2_cleaned = krakentools.extracted_read2, + samplename = "~{samplename}_~{taxon_id}" + } + call minimap2_task.minimap2 as minimap2_assembly_correction { + input: + query1 = krakentools.extracted_read1, + query2 = krakentools.extracted_read2, + reference = metaspades_pe.assembly_fasta, + samplename = "~{samplename}_~{taxon_id}", + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam as 
sort_bam_assembly_correction { + input: + sam = minimap2_assembly_correction.minimap2_out, + samplename = "~{samplename}_~{taxon_id}" + } + call pilon_task.pilon { + input: + assembly = metaspades_pe.assembly_fasta, + bam = sort_bam_assembly_correction.bam, + bai = sort_bam_assembly_correction.bai, + samplename = "~{samplename}_~{taxon_id}" + } + call quast_task.quast { + input: + assembly = pilon.assembly_fasta, + samplename = "~{samplename}_~{taxon_id}", + min_contig_length = 1 + } + call morgana_magic_workflow.morgana_magic { + input: + samplename = "~{samplename}_~{taxon_id}", + assembly_fasta = pilon.assembly_fasta, + read1 = krakentools.extracted_read1, + read2 = krakentools.extracted_read2, + taxon_id = taxon_id + } + } + # DO OUTPUTS????Q + } + } +} \ No newline at end of file diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl new file mode 100644 index 000000000..9dd322c8f --- /dev/null +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -0,0 +1,112 @@ +version 1.0 + +import "../../tasks/quality_control/advanced_metrics/task_vadr.wdl" as vadr_task +import "../../tasks/quality_control/basic_statistics/task_consensus_qc.wdl" as consensus_qc_task +import "../../tasks/species_typing/betacoronavirus/task_pangolin.wdl" as pangolin +import "../../tasks/species_typing/lentivirus/task_quasitools.wdl" as quasitools +import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task +import "../utilities/wf_organism_parameters.wdl" as set_organism_defaults + +workflow morgana_magic { + input { + String samplename + File assembly_fasta + File read1 + File read2 + String taxon_id + } + call set_organism_defaults.organism_parameters { + input: + taxon_id = taxon_id, + organism = "temporary" + } + call consensus_qc_task.consensus_qc { + input: + assembly_fasta = assembly_fasta, + reference_genome = organism_parameters.reference, + genome_length = organism_parameters.genome_length + } + if 
(organism_parameters.standardized_organism == "sars-cov-2") { + call pangolin.pangolin4 { + input: + samplename = samplename, + fasta = assembly_fasta, + docker = organism_parameters.pangolin_docker + } + } + if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { + # tasks specific to either MPXV, sars-cov-2, or RSV-A/RSV-B + call nextclade_task.nextclade_v3 { + input: + genome_fasta = assembly_fasta, + dataset_name = organism_parameters.nextclade_dataset_name, + dataset_tag = organism_parameters.nextclade_dataset_tag + } + call nextclade_task.nextclade_output_parser { + input: + nextclade_tsv = nextclade_v3.nextclade_tsv, + organism = organism_parameters.standardized_organism + } + } + if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "WNV" || organism_parameters.standardized_organism == "flu" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { + # tasks specific to MPXV, sars-cov-2, WNV, flu, rsv_a, and rsv_b + call vadr_task.vadr { + input: + genome_fasta = assembly_fasta, + assembly_length_unambiguous = consensus_qc.number_ATCG, + vadr_opts = organism_parameters.vadr_opts, + max_length = organism_parameters.vadr_maxlength, + skip_length = organism_parameters.vadr_skiplength, + memory = organism_parameters.vadr_memory + } + } + if (organism_parameters.standardized_organism == "HIV") { + call quasitools.quasitools as quasitools_illumina_pe { + input: + read1 = read1, + read2 = read2, + samplename = samplename + } + } + output { + String organism = organism_parameters.standardized_organism + # Pangolin outputs + String? pango_lineage = pangolin4.pangolin_lineage + String? 
pango_lineage_expanded = pangolin4.pangolin_lineage_expanded + String? pangolin_conflicts = pangolin4.pangolin_conflicts + String? pangolin_notes = pangolin4.pangolin_notes + String? pangolin_assignment_version = pangolin4.pangolin_assignment_version + File? pango_lineage_report = pangolin4.pango_lineage_report + String? pangolin_docker = pangolin4.pangolin_docker + String? pangolin_versions = pangolin4.pangolin_versions + # Nextclade outputs for all organisms + String nextclade_version = select_first([nextclade_v3.nextclade_version, ""]) + String nextclade_docker = select_first([nextclade_v3.nextclade_docker, ""]) + # Nextclade outputs for non-flu + File? nextclade_json = nextclade_v3.nextclade_json + File? auspice_json = nextclade_v3.auspice_json + File? nextclade_tsv = nextclade_v3.nextclade_tsv + String nextclade_ds_tag = organism_parameters.nextclade_dataset_tag + String? nextclade_aa_subs = nextclade_output_parser.nextclade_aa_subs + String? nextclade_aa_dels = nextclade_output_parser.nextclade_aa_dels + String? nextclade_clade = nextclade_output_parser.nextclade_clade + String? nextclade_lineage = nextclade_output_parser.nextclade_lineage + String? nextclade_qc = nextclade_output_parser.nextclade_qc + # VADR Annotation QC + File? vadr_alerts_list = vadr.alerts_list + File? vadr_feature_tbl_pass = vadr.feature_tbl_pass + File? vadr_feature_tbl_fail = vadr.feature_tbl_fail + File? vadr_classification_summary_file = vadr.classification_summary_file + File? vadr_all_outputs_tar_gz = vadr.outputs_tgz + String? vadr_num_alerts = vadr.num_alerts + String? vadr_docker = vadr.vadr_docker + File? vadr_fastas_zip_archive = vadr.vadr_fastas_zip_archive + # HIV Outputs + String? quasitools_version = quasitools_illumina_pe.quasitools_version + String? quasitools_date = quasitools_illumina_pe.quasitools_date + File? quasitools_coverage_file = quasitools_illumina_pe.coverage_file + File? quasitools_dr_report = quasitools_illumina_pe.dr_report + File? 
quasitools_hydra_vcf = quasitools_illumina_pe.hydra_vcf + File? quasitools_mutations_report = quasitools_illumina_pe.mutations_report + } +} \ No newline at end of file diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 43a2fa373..da9c4133a 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -6,6 +6,7 @@ workflow organism_parameters { } input { String organism + String taxon_id # hiv information String hiv_primer_version = "v1" @@ -48,7 +49,35 @@ workflow organism_parameters { Float? narrow_bandwidth Float? proportion_wide } - if (organism == "sars-cov-2" || organism == "SARS-CoV-2") { + if (defined(taxon_id)) { + if (taxon_id == "2697049") { + String sc2 = "sars-cov-2" + } + if (taxon_id == "10244") { + String mpox = "MPXV" + } + if (taxon_id == "11082") { + String wnv = "WNV" + } + if (taxon_id == "11320") { + # flu A + String flua = "flu" + } + if (taxon_id == "11520") { + # flu B + String flub = "flu" + } + if (taxon_id == "12814") { + String rsva = "rsv_a" + } + if (taxon_id == "12815") { + String rsvb = "rsv_b" + } + if (taxon_id == "11676") { + String hiv = "HIV" + } + } + if (organism == "sars-cov-2" || organism == "SARS-CoV-2" || defined(sc2)) { String sc2_org_name = "sars-cov-2" String sc2_reference_genome = "gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta" String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" @@ -61,7 +90,7 @@ workflow organism_parameters { String sc2_vadr_options = "--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta" Int sc2_vadr_memory = 8 } - if (organism == "MPXV" || organism == "mpox" || organism == "monkeypox" || organism == "Monkeypox virus" || organism == "Mpox") { + if (organism == "MPXV" || organism == "mpox" || organism == 
"monkeypox" || organism == "Monkeypox virus" || organism == "Mpox" || defined(mpox)) { String mpox_org_name = "MPXV" String mpox_reference_genome = "gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta" String mpox_gene_locations_bed = "gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed" @@ -88,7 +117,7 @@ workflow organism_parameters { Float mpox_narrow_bandwidth = 0.1666667 Float mpox_proportion_wide = 0.0 } - if (organism == "WNV" || organism == "wnv" || organism == "West Nile virus") { + if (organism == "WNV" || organism == "wnv" || organism == "West Nile virus" || defined(wnv)) { String wnv_org_name = "WNV" String wnv_reference_genome = "gs://theiagen-public-files/terra/theiacov-files/WNV/NC_009942.1_wnv_L1.fasta" String wnv_kraken_target_organism = "West Nile virus" @@ -101,7 +130,7 @@ workflow organism_parameters { String wnv_nextclade_ds_tag = "NA" String wnv_nextclade_ds_name = "NA" } - if (organism == "flu" || organism == "influenza" || organism == "Flu" || organism == "Influenza") { + if (organism == "flu" || organism == "influenza" || organism == "Flu" || organism == "Influenza" || defined(flua) || defined (flub)) { String flu_org_name = "flu" Int flu_genome_len = 13500 @@ -185,7 +214,7 @@ workflow organism_parameters { } } } - if (organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A") { + if (organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A" || defined(rsva)) { String rsv_a_org_name = "rsv_a" String rsv_a_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta" String rsv_a_nextclade_ds_tag = "2024-08-01--22-31-31Z" @@ -209,7 +238,7 @@ workflow organism_parameters { Float rsv_a_narrow_bandwidth = 0.1666667 Float rsv_a_proportion_wide = 0.0 } - if (organism == "rsv_b" || organism == "rsv-b" || organism == "RSV-B" || organism == "RSV_B") { + if (organism == "rsv_b" || organism == "rsv-b" || organism == 
"RSV-B" || organism == "RSV_B" || defined(rsvb)) { String rsv_b_org_name = "rsv_b" String rsv_b_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta" String rsv_b_nextclade_ds_tag = "2024-08-01--22-31-31Z" @@ -233,7 +262,7 @@ workflow organism_parameters { Float rsv_b_narrow_bandwidth = 0.1666667 Float rsv_b_proportion_wide = 0.0 } - if (organism == "HIV" && hiv_primer_version == "v1") { + if (organism == "HIV" && hiv_primer_version == "v1" || defined(hiv)) { String hiv_v1_org_name = "HIV" String hiv_v1_reference_genome = "gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.fasta" String hiv_v1_reference_gff = "gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.gff3" From b3bd529409e1c8551a5e044acd2e0e5a5ff33d7b Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 24 Sep 2024 19:23:43 +0000 Subject: [PATCH 02/48] rename taxon id vars in org param --- .../utilities/wf_organism_parameters.wdl | 50 ++++++++++--------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index da9c4133a..cda034558 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -6,7 +6,7 @@ workflow organism_parameters { } input { String organism - String taxon_id + String? taxon_id # hiv information String hiv_primer_version = "v1" @@ -49,35 +49,36 @@ workflow organism_parameters { Float? narrow_bandwidth Float? 
proportion_wide } + # for morgana_magic & theiameta_panel compatibility if (defined(taxon_id)) { - if (taxon_id == "2697049") { - String sc2 = "sars-cov-2" + if (select_first([taxon_id]) == "2697049") { + String sars_cov_2_taxon_id = "sars-cov-2" } - if (taxon_id == "10244") { - String mpox = "MPXV" + if (select_first([taxon_id]) == "10244") { + String mpox_taxon_id = "MPXV" } - if (taxon_id == "11082") { - String wnv = "WNV" + if (select_first([taxon_id]) == "11082") { + String wnv_taxon_id = "WNV" } - if (taxon_id == "11320") { + if (select_first([taxon_id]) == "11320") { # flu A - String flua = "flu" + String flu_a_taxon_id = "flu" } - if (taxon_id == "11520") { + if (select_first([taxon_id]) == "11520") { # flu B - String flub = "flu" + String flu_b_taxon_id = "flu" } - if (taxon_id == "12814") { - String rsva = "rsv_a" + if (select_first([taxon_id]) == "12814") { + String rsv_a_taxon_id = "rsv_a" } - if (taxon_id == "12815") { - String rsvb = "rsv_b" + if (select_first([taxon_id]) == "12815") { + String rsv_b_taxon_id = "rsv_b" } - if (taxon_id == "11676") { - String hiv = "HIV" + if (select_first([taxon_id]) == "11676") { + String hiv_taxon_id = "HIV" } } - if (organism == "sars-cov-2" || organism == "SARS-CoV-2" || defined(sc2)) { + if (organism == "sars-cov-2" || organism == "SARS-CoV-2" || defined(sars_cov_2_taxon_id)) { String sc2_org_name = "sars-cov-2" String sc2_reference_genome = "gs://theiagen-public-files-rp/terra/augur-sars-cov-2-references/MN908947.fasta" String sc2_gene_locations_bed = "gs://theiagen-public-files-rp/terra/sars-cov-2-files/sc2_gene_locations.bed" @@ -90,7 +91,7 @@ workflow organism_parameters { String sc2_vadr_options = "--noseqnamemax --glsearch -s -r --nomisc --mkey sarscov2 --lowsim5seq 6 --lowsim3seq 6 --alt_fail lowscore,insertnn,deletinn --out_allfasta" Int sc2_vadr_memory = 8 } - if (organism == "MPXV" || organism == "mpox" || organism == "monkeypox" || organism == "Monkeypox virus" || organism == "Mpox" || defined(mpox)) { 
+ if (organism == "MPXV" || organism == "mpox" || organism == "monkeypox" || organism == "Monkeypox virus" || organism == "Mpox" || defined(mpox_taxon_id)) { String mpox_org_name = "MPXV" String mpox_reference_genome = "gs://theiagen-public-files/terra/mpxv-files/MPXV.MT903345.reference.fasta" String mpox_gene_locations_bed = "gs://theiagen-public-files/terra/mpxv-files/mpox_gene_locations.bed" @@ -117,7 +118,7 @@ workflow organism_parameters { Float mpox_narrow_bandwidth = 0.1666667 Float mpox_proportion_wide = 0.0 } - if (organism == "WNV" || organism == "wnv" || organism == "West Nile virus" || defined(wnv)) { + if (organism == "WNV" || organism == "wnv" || organism == "West Nile virus" || defined(wnv_taxon_id)) { String wnv_org_name = "WNV" String wnv_reference_genome = "gs://theiagen-public-files/terra/theiacov-files/WNV/NC_009942.1_wnv_L1.fasta" String wnv_kraken_target_organism = "West Nile virus" @@ -130,7 +131,7 @@ workflow organism_parameters { String wnv_nextclade_ds_tag = "NA" String wnv_nextclade_ds_name = "NA" } - if (organism == "flu" || organism == "influenza" || organism == "Flu" || organism == "Influenza" || defined(flua) || defined (flub)) { + if (organism == "flu" || organism == "influenza" || organism == "Flu" || organism == "Influenza" || defined(flu_a_taxon_id) || defined (flu_b_taxon_id)) { String flu_org_name = "flu" Int flu_genome_len = 13500 @@ -214,7 +215,7 @@ workflow organism_parameters { } } } - if (organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A" || defined(rsva)) { + if (organism == "rsv_a" || organism == "rsv-a" || organism == "RSV-A" || organism == "RSV_A" || defined(rsv_a_taxon_id)) { String rsv_a_org_name = "rsv_a" String rsv_a_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_a.fasta" String rsv_a_nextclade_ds_tag = "2024-08-01--22-31-31Z" @@ -238,7 +239,7 @@ workflow organism_parameters { Float rsv_a_narrow_bandwidth = 0.1666667 Float 
rsv_a_proportion_wide = 0.0 } - if (organism == "rsv_b" || organism == "rsv-b" || organism == "RSV-B" || organism == "RSV_B" || defined(rsvb)) { + if (organism == "rsv_b" || organism == "rsv-b" || organism == "RSV-B" || organism == "RSV_B" || defined(rsv_b_taxon_id)) { String rsv_b_org_name = "rsv_b" String rsv_b_reference_genome = "gs://theiagen-public-files-rp/terra/rsv_references/reference_rsv_b.fasta" String rsv_b_nextclade_ds_tag = "2024-08-01--22-31-31Z" @@ -262,7 +263,8 @@ workflow organism_parameters { Float rsv_b_narrow_bandwidth = 0.1666667 Float rsv_b_proportion_wide = 0.0 } - if (organism == "HIV" && hiv_primer_version == "v1" || defined(hiv)) { + # assuming HIV v1 for now for taxon_id -- this is not accurate and will need reexamination + if (organism == "HIV" && hiv_primer_version == "v1" || defined(hiv_taxon_id)) { String hiv_v1_org_name = "HIV" String hiv_v1_reference_genome = "gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.fasta" String hiv_v1_reference_gff = "gs://theiagen-public-files/terra/hivgc-files/NC_001802.1.gff3" From 593e5b587fc121a3116d033ee7839f8b590d3b78 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 24 Sep 2024 20:00:04 +0000 Subject: [PATCH 03/48] language --- workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 5 ++++- workflows/utilities/wf_morgana_magic.wdl | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 1965d5089..f245047fb 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -20,6 +20,7 @@ workflow theiameta_panel_illumina_pe { Array[Int]? taxon_ids # suggest using a workspace element if user wants to modify? 
File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" } + # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that call kraken_task.kraken2_standalone as kraken2_raw { input: samplename = samplename, @@ -54,6 +55,8 @@ workflow theiameta_panel_illumina_pe { scatter (taxon_id in taxon_ids) { call krakentools_task.extract_kraken_reads as krakentools { input: + # we should consider changing the classified_report name so + # it won't be confused with the actual kraken2 report kraken2_output = kraken2_clean.kraken2_classified_report, kraken2_report = kraken2_clean.kraken2_report, read1 = read_QC_trim.read1_clean, @@ -66,7 +69,7 @@ workflow theiameta_panel_illumina_pe { read1 = krakentools.extracted_read1, read2 = krakentools.extracted_read2 } - #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING + #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### if (fastq_scan_binned.read1_seq > 100) { call metaspades_task.metaspades_pe { input: diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index 9dd322c8f..a860983f6 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -18,7 +18,7 @@ workflow morgana_magic { call set_organism_defaults.organism_parameters { input: taxon_id = taxon_id, - organism = "temporary" + organism = "To Be Determined" } call consensus_qc_task.consensus_qc { input: From 01c1223ae73799f69c699a7599cd4590ed683705 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 26 Sep 2024 19:54:03 +0000 Subject: [PATCH 04/48] progress --- .../data_handling/task_gather_scatter.wdl | 68 +++++++++++++++++++ .../wf_theiameta_panel_illumina_pe.wdl | 38 ++++++++++- workflows/utilities/wf_morgana_magic.wdl | 3 + 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 tasks/utilities/data_handling/task_gather_scatter.wdl diff --git 
a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl new file mode 100644 index 000000000..3d04db664 --- /dev/null +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -0,0 +1,68 @@ +version 1.0 + +task gather_scatter { + input { + Array[Int]? taxon_ids + Array[String?] organism + Array[File?] extracted_read1 + Array[File?] extracted_read2 + Array[Int?] fastq_scan_num_reads_binned1 + Array[Int?] fastq_scan_num_reads_binned2 + Array[String?] fastq_scan_num_reads_binned_pairs + Array[File?] pilon_assembly_fasta ### maybe????? + Array[Int?] quast_genome_length + Array[Int?] quast_number_contigs + Array[Int?] quast_n50 + Array[Float?] quast_gc_percent + Array[String?] pango_lineage + Array[String?] pango_lineage_expanded + Array[String?] pangolin_conflicts + Array[String?] pangolin_notes + Array[String?] pangolin_assignment_version + Array[String?] pangolin_versions + Array[String?] pangolin_docker + Array[String?] nextclade_version + Array[String?] nextclade_docker + Array[String?] nextclade_ds_tag + Array[String?] nextclade_aa_subs + Array[String?] nextclade_aa_dels + Array[String?] nextclade_clade + Array[String?] nextclade_lineage + Array[String?] 
nextclade_qc + + + } + command <<< + echo "taxon_ids: ~{sep="," taxon_ids}" + echo "organism: ~{sep="," organism}" + echo "extracted_read1: ~{sep="," extracted_read1}" + echo "extracted_read2: ~{sep="," extracted_read2}" + echo "fastq_scan_num_reads_binned1: ~{sep="," fastq_scan_num_reads_binned1}" + echo "fastq_scan_num_reads_binned2: ~{sep="," fastq_scan_num_reads_binned2}" + echo "fastq_scan_num_reads_binned_pairs: ~{sep="," fastq_scan_num_reads_binned_pairs}" + echo "pilon_assembly_fasta: ~{sep="," pilon_assembly_fasta}" + echo "quast_genome_length: ~{sep="," quast_genome_length}" + echo "quast_number_contigs: ~{sep="," quast_number_contigs}" + echo "quast_n50: ~{sep="," quast_n50}" + echo "quast_gc_percent: ~{sep="," quast_gc_percent}" + echo "pango_lineage: ~{sep="," pango_lineage}" + echo "pango_lineage_expanded: ~{sep="," pango_lineage_expanded}" + echo "pangolin_conflicts: ~{sep="," pangolin_conflicts}" + echo "pangolin_notes: ~{sep="," pangolin_notes}" + echo "pangolin_assignment_version: ~{sep="," pangolin_assignment_version}" + echo "pangolin_versions: ~{sep="," pangolin_versions}" + echo "pangolin_docker: ~{sep="," pangolin_docker}" + echo "nextclade_version: ~{sep="," nextclade_version}" + echo "nextclade_docker: ~{sep="," nextclade_docker}" + echo "nextclade_ds_tag: ~{sep="," nextclade_ds_tag}" + echo "nextclade_aa_subs: ~{sep="," nextclade_aa_subs}" + echo "nextclade_aa_dels: ~{sep="," nextclade_aa_dels}" + echo "nextclade_clade: ~{sep="," nextclade_clade}" + echo "nextclade_lineage: ~{sep="," nextclade_lineage}" + echo "nextclade_qc: ~{sep="," nextclade_qc}" + + >>> + output { + Array[Int]? 
taxon_ids_out = taxon_ids + } +} \ No newline at end of file diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index f245047fb..ac91580f8 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -8,6 +8,7 @@ import "../../tasks/quality_control/read_filtering/task_pilon.wdl" as pilon_task import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task import "../../tasks/taxon_id/task_krakentools.wdl" as krakentools_task import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task +import "../../tasks/utilities/data_handling/task_gather_scatter.wdl" as gather_scatter_task import "../../tasks/utilities/data_handling/task_parse_mapping.wdl" as parse_mapping_task import "../utilities/wf_morgana_magic.wdl" as morgana_magic_workflow import "../utilities/wf_read_QC_trim_pe.wdl" as read_qc_trim_pe @@ -71,6 +72,7 @@ workflow theiameta_panel_illumina_pe { } #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### if (fastq_scan_binned.read1_seq > 100) { + String did_attempt_assembly = "Assembly attempted" call metaspades_task.metaspades_pe { input: read1_cleaned = krakentools.extracted_read1, @@ -110,10 +112,42 @@ workflow theiameta_panel_illumina_pe { assembly_fasta = pilon.assembly_fasta, read1 = krakentools.extracted_read1, read2 = krakentools.extracted_read2, - taxon_id = taxon_id + taxon_id = "~{taxon_id}" } } - # DO OUTPUTS????Q } } + call gather_scatter_task.gather_scatter { + input: + taxon_ids = select_first([taxon_ids]), + organism = morgana_magic.organism, + extracted_read1 = krakentools.extracted_read1, + extracted_read2 = krakentools.extracted_read2, + fastq_scan_num_reads_binned1 = fastq_scan_binned.read1_seq, + fastq_scan_num_reads_binned2 = fastq_scan_binned.read2_seq, + fastq_scan_num_reads_binned_pairs = fastq_scan_binned.read_pairs, + pilon_assembly_fasta = pilon.assembly_fasta, + 
quast_genome_length = quast.genome_length, + quast_number_contigs = quast.number_contigs, + quast_n50 = quast.n50_value, + quast_gc_percent = quast.gc_percent, + pango_lineage = morgana_magic.pango_lineage, + pango_lineage_expanded = morgana_magic.pango_lineage_expanded, + pangolin_conflicts = morgana_magic.pangolin_conflicts, + pangolin_notes = morgana_magic.pangolin_notes, + pangolin_assignment_version = morgana_magic.pangolin_assignment_version, + pangolin_versions = morgana_magic.pangolin_versions, + pangolin_docker = morgana_magic.pangolin_docker, + nextclade_version = morgana_magic.nextclade_version, + nextclade_docker = morgana_magic.nextclade_docker, + nextclade_ds_tag = morgana_magic.nextclade_ds_tag, + nextclade_aa_subs = morgana_magic.nextclade_aa_subs, + nextclade_aa_dels = morgana_magic.nextclade_aa_dels, + nextclade_clade = morgana_magic.nextclade_clade, + nextclade_lineage = morgana_magic.nextclade_lineage, + nextclade_qc = morgana_magic.nextclade_qc + } + output { + Array[String?] identified_organisms = select_first([morgana_magic.organism]) + } } \ No newline at end of file diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index a860983f6..e6c6664c4 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -15,6 +15,7 @@ workflow morgana_magic { File read2 String taxon_id } + #### need to add more flu characterization call set_organism_defaults.organism_parameters { input: taxon_id = taxon_id, @@ -48,6 +49,7 @@ workflow morgana_magic { organism = organism_parameters.standardized_organism } } + ##### is running vadr even something we want to do???? 
if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "WNV" || organism_parameters.standardized_organism == "flu" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { # tasks specific to MPXV, sars-cov-2, WNV, flu, rsv_a, and rsv_b call vadr_task.vadr { @@ -60,6 +62,7 @@ workflow morgana_magic { memory = organism_parameters.vadr_memory } } + ##### is running quasitools even something we want to do???? if (organism_parameters.standardized_organism == "HIV") { call quasitools.quasitools as quasitools_illumina_pe { input: From 2c630222cd6406198940e382d48be212dc50b097 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 9 Oct 2024 16:03:13 +0000 Subject: [PATCH 05/48] notes --- tasks/taxon_id/task_krakentools.wdl | 3 +++ tasks/utilities/data_handling/task_gather_scatter.wdl | 4 ++++ workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 4 +++- workflows/utilities/wf_morgana_magic.wdl | 1 + workflows/utilities/wf_organism_parameters.wdl | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tasks/taxon_id/task_krakentools.wdl b/tasks/taxon_id/task_krakentools.wdl index 7cbdea235..ea21adecb 100644 --- a/tasks/taxon_id/task_krakentools.wdl +++ b/tasks/taxon_id/task_krakentools.wdl @@ -36,6 +36,8 @@ task extract_kraken_reads { echo "false" > CONTINUE fi + grep ~{taxon_id} ~{kraken2_report} | awk '{for (i=6; i <= NF; ++i) print $i}' | tr '\n' ' ' | xargs > ORGANISM_NAME + gzip ~{taxon_id}_1.fastq gzip ~{taxon_id}_2.fastq @@ -43,6 +45,7 @@ task extract_kraken_reads { output { File extracted_read1 = "~{taxon_id}_1.fastq.gz" File extracted_read2 = "~{taxon_id}_2.fastq.gz" + String organism_name = read_string("ORGANISM_NAME") ### fix String krakentools_docker = docker Boolean success = read_boolean("CONTINUE") } diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl 
b/tasks/utilities/data_handling/task_gather_scatter.wdl index 3d04db664..afc56980e 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -61,6 +61,10 @@ task gather_scatter { echo "nextclade_lineage: ~{sep="," nextclade_lineage}" echo "nextclade_qc: ~{sep="," nextclade_qc}" + + # turn into tsv? + # output to file + >>> output { Array[Int]? taxon_ids_out = taxon_ids diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index ac91580f8..67ab83036 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -19,6 +19,8 @@ workflow theiameta_panel_illumina_pe { File read1 File read2 Array[Int]? taxon_ids # suggest using a workspace element if user wants to modify? + + Int minimum_read_number = 100 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" } # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that @@ -71,7 +73,7 @@ workflow theiameta_panel_illumina_pe { read2 = krakentools.extracted_read2 } #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### - if (fastq_scan_binned.read1_seq > 100) { + if (fastq_scan_binned.read1_seq > ~{minimum_read_number}) { String did_attempt_assembly = "Assembly attempted" call metaspades_task.metaspades_pe { input: diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index e6c6664c4..3d266cfe3 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -49,6 +49,7 @@ workflow morgana_magic { organism = organism_parameters.standardized_organism } } + ### add flu ##### is running vadr even something we want to do???? 
if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "WNV" || organism_parameters.standardized_organism == "flu" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { # tasks specific to MPXV, sars-cov-2, WNV, flu, rsv_a, and rsv_b diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index cda034558..4fa3b4e1c 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -63,6 +63,7 @@ workflow organism_parameters { if (select_first([taxon_id]) == "11320") { # flu A String flu_a_taxon_id = "flu" + # also do flu type } if (select_first([taxon_id]) == "11520") { # flu B From 83e8add66262cc2e4fb1095803dd153e27e33d0d Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 10 Oct 2024 19:23:02 +0000 Subject: [PATCH 06/48] finish --- .dockstore.yml | 5 + tasks/taxon_id/contamination/task_kraken2.wdl | 2 +- .../data_handling/task_gather_scatter.wdl | 112 ++++++++++++------ .../wf_theiameta_panel_illumina_pe.wdl | 71 +++++++---- workflows/utilities/wf_flu_track.wdl | 4 +- workflows/utilities/wf_morgana_magic.wdl | 83 ++++++++----- 6 files changed, 189 insertions(+), 88 deletions(-) diff --git a/.dockstore.yml b/.dockstore.yml index 5306d30ed..caaa987d3 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -225,6 +225,11 @@ workflows: primaryDescriptorPath: /workflows/theiameta/wf_theiameta_illumina_pe.wdl testParameterFiles: - /tests/inputs/empty.json + - name: TheiaMeta_Panel_Illumina_PE_PHB + subclass: WDL + primaryDescriptorPath: /workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl + testParameterFiles: + - /tests/inputs/empty.json - name: Snippy_Streamline_PHB subclass: WDL primaryDescriptorPath: /workflows/phylogenetics/wf_snippy_streamline.wdl diff --git 
a/tasks/taxon_id/contamination/task_kraken2.wdl b/tasks/taxon_id/contamination/task_kraken2.wdl index fb1522c75..01ef7b234 100644 --- a/tasks/taxon_id/contamination/task_kraken2.wdl +++ b/tasks/taxon_id/contamination/task_kraken2.wdl @@ -147,8 +147,8 @@ task kraken2_standalone { File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz" File? kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz" File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz" - Float kraken2_percent_human = read_float("PERCENT_HUMAN") File? kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz" + Float kraken2_percent_human = read_float("PERCENT_HUMAN") String kraken2_database = kraken2_db } runtime { diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index afc56980e..243b31530 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -2,18 +2,33 @@ version 1.0 task gather_scatter { input { + String samplename Array[Int]? taxon_ids + # krakentools outputs Array[String?] organism Array[File?] extracted_read1 Array[File?] extracted_read2 + Array[String]? krakentools_docker + # fastq_scan outputs Array[Int?] fastq_scan_num_reads_binned1 Array[Int?] fastq_scan_num_reads_binned2 Array[String?] fastq_scan_num_reads_binned_pairs + Array[String?] fastq_scan_docker + Array[String?] fastq_scan_version + # Assembly Array[File?] pilon_assembly_fasta ### maybe????? + # quast outputs Array[Int?] quast_genome_length Array[Int?] quast_number_contigs Array[Int?] quast_n50 Array[Float?] quast_gc_percent + # consensus qc outputs + Array[Int?] number_N + Array[Int?] number_ATCG + Array[Int?] number_Degenerate + Array[Int?] number_Total + Array[Float?] percent_reference_coverage + # pangolin outputs Array[String?] pango_lineage Array[String?] pango_lineage_expanded Array[String?] 
pangolin_conflicts @@ -21,6 +36,7 @@ task gather_scatter { Array[String?] pangolin_assignment_version Array[String?] pangolin_versions Array[String?] pangolin_docker + # Nextclade outputs for non-flu Array[String?] nextclade_version Array[String?] nextclade_docker Array[String?] nextclade_ds_tag @@ -29,44 +45,70 @@ task gather_scatter { Array[String?] nextclade_clade Array[String?] nextclade_lineage Array[String?] nextclade_qc - - + # Nextclade outputs for flu HA + Array[String?] nextclade_ds_tag_flu_ha + Array[String?] nextclade_aa_subs_flu_ha + Array[String?] nextclade_aa_dels_flu_ha + Array[String?] nextclade_clade_flu_ha + Array[String?] nextclade_qc_flu_ha + # Nextclade outputs for flu NA + Array[String?] nextclade_ds_tag_flu_na + Array[String?] nextclade_aa_subs_flu_na + Array[String?] nextclade_aa_dels_flu_na + Array[String?] nextclade_clade_flu_na + Array[String?] nextclade_qc_flu_na } command <<< - echo "taxon_ids: ~{sep="," taxon_ids}" - echo "organism: ~{sep="," organism}" - echo "extracted_read1: ~{sep="," extracted_read1}" - echo "extracted_read2: ~{sep="," extracted_read2}" - echo "fastq_scan_num_reads_binned1: ~{sep="," fastq_scan_num_reads_binned1}" - echo "fastq_scan_num_reads_binned2: ~{sep="," fastq_scan_num_reads_binned2}" - echo "fastq_scan_num_reads_binned_pairs: ~{sep="," fastq_scan_num_reads_binned_pairs}" - echo "pilon_assembly_fasta: ~{sep="," pilon_assembly_fasta}" - echo "quast_genome_length: ~{sep="," quast_genome_length}" - echo "quast_number_contigs: ~{sep="," quast_number_contigs}" - echo "quast_n50: ~{sep="," quast_n50}" - echo "quast_gc_percent: ~{sep="," quast_gc_percent}" - echo "pango_lineage: ~{sep="," pango_lineage}" - echo "pango_lineage_expanded: ~{sep="," pango_lineage_expanded}" - echo "pangolin_conflicts: ~{sep="," pangolin_conflicts}" - echo "pangolin_notes: ~{sep="," pangolin_notes}" - echo "pangolin_assignment_version: ~{sep="," pangolin_assignment_version}" - echo "pangolin_versions: ~{sep="," pangolin_versions}" - 
echo "pangolin_docker: ~{sep="," pangolin_docker}" - echo "nextclade_version: ~{sep="," nextclade_version}" - echo "nextclade_docker: ~{sep="," nextclade_docker}" - echo "nextclade_ds_tag: ~{sep="," nextclade_ds_tag}" - echo "nextclade_aa_subs: ~{sep="," nextclade_aa_subs}" - echo "nextclade_aa_dels: ~{sep="," nextclade_aa_dels}" - echo "nextclade_clade: ~{sep="," nextclade_clade}" - echo "nextclade_lineage: ~{sep="," nextclade_lineage}" - echo "nextclade_qc: ~{sep="," nextclade_qc}" - - - # turn into tsv? - # output to file - + ( + echo -e "taxon_ids\torganism\textracted_read1\textracted_read2\tkrakentools_docker\tfastq_scan_num_reads_binned1\tfastq_scan_num_reads_binned2\tfastq_scan_num_reads_binned_pairs\tfastq_scan_docker\tfastq_scan_version\tpilon_assembly_fasta\tquast_genome_length\tquast_number_contigs\tquast_n50\tquast_gc_percent\tnumber_N\tnumber_ATCG\tnumber_Degenerate\tnumber_Total\tpercent_reference_coverage\tpango_lineage\tpango_lineage_expanded\tpangolin_conflicts\tpangolin_notes\tpangolin_assignment_version\tpangolin_versions\tpangolin_docker\tnextclade_version\tnextclade_docker\tnextclade_ds_tag\tnextclade_aa_subs\tnextclade_aa_dels\tnextclade_clade\tnextclade_lineage\tnextclade_qc\tnextclade_ds_tag_flu_ha\tnextclade_aa_subs_flu_ha\tnextclade_aa_dels_flu_ha\tnextclade_clade_flu_ha\tnextclade_qc_flu_ha\tnextclade_ds_tag_flu_na\tnextclade_aa_subs_flu_na\tnextclade_aa_dels_flu_na\tnextclade_clade_flu_na\tnextclade_qc_flu_na" + paste <(echo "~{sep="\n" taxon_ids}") \ + <(echo "~{sep="\n" organism}") \ + <(echo "~{sep="\n" extracted_read1}") \ + <(echo "~{sep="\n" extracted_read2}") \ + <(echo "~{sep="\n" krakentools_docker}") \ + <(echo "~{sep="\n" fastq_scan_num_reads_binned1}") \ + <(echo "~{sep="\n" fastq_scan_num_reads_binned2}") \ + <(echo "~{sep="\n" fastq_scan_num_reads_binned_pairs}") \ + <(echo "~{sep="\n" fastq_scan_docker}") \ + <(echo "~{sep="\n" fastq_scan_version}") \ + <(echo "~{sep="\n" pilon_assembly_fasta}") \ + <(echo "~{sep="\n" 
quast_genome_length}") \ + <(echo "~{sep="\n" quast_number_contigs}") \ + <(echo "~{sep="\n" quast_n50}") \ + <(echo "~{sep="\n" quast_gc_percent}") \ + <(echo "~{sep="\n" number_N}") \ + <(echo "~{sep="\n" number_ATCG}") \ + <(echo "~{sep="\n" number_Degenerate}") \ + <(echo "~{sep="\n" number_Total}") \ + <(echo "~{sep="\n" percent_reference_coverage}") \ + <(echo "~{sep="\n" pango_lineage}") \ + <(echo "~{sep="\n" pango_lineage_expanded}") \ + <(echo "~{sep="\n" pangolin_conflicts}") \ + <(echo "~{sep="\n" pangolin_notes}") \ + <(echo "~{sep="\n" pangolin_assignment_version}") \ + <(echo "~{sep="\n" pangolin_versions}") \ + <(echo "~{sep="\n" pangolin_docker}") \ + <(echo "~{sep="\n" nextclade_version}") \ + <(echo "~{sep="\n" nextclade_docker}") \ + <(echo "~{sep="\n" nextclade_ds_tag}") \ + <(echo "~{sep="\n" nextclade_aa_subs}") \ + <(echo "~{sep="\n" nextclade_aa_dels}") \ + <(echo "~{sep="\n" nextclade_clade}") \ + <(echo "~{sep="\n" nextclade_lineage}") \ + <(echo "~{sep="\n" nextclade_qc}") \ + <(echo "~{sep="\n" nextclade_ds_tag_flu_ha}") \ + <(echo "~{sep="\n" nextclade_aa_subs_flu_ha}") \ + <(echo "~{sep="\n" nextclade_aa_dels_flu_ha}") \ + <(echo "~{sep="\n" nextclade_clade_flu_ha}") \ + <(echo "~{sep="\n" nextclade_qc_flu_ha}") \ + <(echo "~{sep="\n" nextclade_ds_tag_flu_na}") \ + <(echo "~{sep="\n" nextclade_aa_subs_flu_na}") \ + <(echo "~{sep="\n" nextclade_aa_dels_flu_na}") \ + <(echo "~{sep="\n" nextclade_clade_flu_na}") \ + <(echo "~{sep="\n" nextclade_qc_flu_na}") + ) > ~{samplename}.results.tsv >>> output { - Array[Int]? 
taxon_ids_out = taxon_ids + File gathered_results = "~{samplename}.results.tsv" } } \ No newline at end of file diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 67ab83036..30e4e1041 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -23,19 +23,6 @@ workflow theiameta_panel_illumina_pe { Int minimum_read_number = 100 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" } - # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that - call kraken_task.kraken2_standalone as kraken2_raw { - input: - samplename = samplename, - read1 = read1, - read2 = read2, - kraken2_db = kraken2_db - } - call krona_task.krona as krona_raw { - input: - kraken2_report = kraken2_raw.kraken2_report, - samplename = samplename - } call read_qc_trim_pe.read_QC_trim_pe as read_QC_trim { input: samplename = samplename, @@ -43,16 +30,17 @@ workflow theiameta_panel_illumina_pe { read2 = read2, workflow_series = "theiameta" } - call kraken_task.kraken2_standalone as kraken2_clean { + # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that + call kraken_task.kraken2_standalone as kraken2 { input: samplename = samplename, read1 = read_QC_trim.read1_clean, read2 = read_QC_trim.read2_clean, kraken2_db = kraken2_db } - call krona_task.krona as krona_clean { + call krona_task.krona as krona { input: - kraken2_report = kraken2_clean.kraken2_report, + kraken2_report = kraken2.kraken2_report, samplename = samplename } scatter (taxon_id in taxon_ids) { @@ -60,8 +48,8 @@ workflow theiameta_panel_illumina_pe { input: # we should consider changing the classified_report name so # it won't be confused with the actual kraken2 report - kraken2_output = kraken2_clean.kraken2_classified_report, - kraken2_report = 
kraken2_clean.kraken2_report, + kraken2_output = kraken2.kraken2_classified_report, + kraken2_report = kraken2.kraken2_report, read1 = read_QC_trim.read1_clean, read2 = read_QC_trim.read2_clean, taxon_id = taxon_id @@ -73,7 +61,7 @@ workflow theiameta_panel_illumina_pe { read2 = krakentools.extracted_read2 } #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### - if (fastq_scan_binned.read1_seq > ~{minimum_read_number}) { + if (fastq_scan_binned.read1_seq > minimum_read_number) { String did_attempt_assembly = "Assembly attempted" call metaspades_task.metaspades_pe { input: @@ -114,25 +102,35 @@ workflow theiameta_panel_illumina_pe { assembly_fasta = pilon.assembly_fasta, read1 = krakentools.extracted_read1, read2 = krakentools.extracted_read2, - taxon_id = "~{taxon_id}" + taxon_id = "~{taxon_id}", + seq_method = "ILLUMINA" } } } } call gather_scatter_task.gather_scatter { input: + samplename = samplename, taxon_ids = select_first([taxon_ids]), - organism = morgana_magic.organism, + organism = krakentools.organism_name, extracted_read1 = krakentools.extracted_read1, extracted_read2 = krakentools.extracted_read2, + krakentools_docker = krakentools.krakentools_docker, fastq_scan_num_reads_binned1 = fastq_scan_binned.read1_seq, fastq_scan_num_reads_binned2 = fastq_scan_binned.read2_seq, fastq_scan_num_reads_binned_pairs = fastq_scan_binned.read_pairs, - pilon_assembly_fasta = pilon.assembly_fasta, + fastq_scan_docker = fastq_scan_binned.fastq_scan_docker, + fastq_scan_version = fastq_scan_binned.version, + pilon_assembly_fasta = pilon.assembly_fasta, # maybe?? 
quast_genome_length = quast.genome_length, quast_number_contigs = quast.number_contigs, quast_n50 = quast.n50_value, quast_gc_percent = quast.gc_percent, + number_N = morgana_magic.number_N, + number_ATCG = morgana_magic.number_ATCG, + number_Degenerate = morgana_magic.number_Degenerate, + number_Total = morgana_magic.number_Total, + percent_reference_coverage = morgana_magic.percent_reference_coverage, pango_lineage = morgana_magic.pango_lineage, pango_lineage_expanded = morgana_magic.pango_lineage_expanded, pangolin_conflicts = morgana_magic.pangolin_conflicts, @@ -147,9 +145,34 @@ workflow theiameta_panel_illumina_pe { nextclade_aa_dels = morgana_magic.nextclade_aa_dels, nextclade_clade = morgana_magic.nextclade_clade, nextclade_lineage = morgana_magic.nextclade_lineage, - nextclade_qc = morgana_magic.nextclade_qc + nextclade_qc = morgana_magic.nextclade_qc, + nextclade_ds_tag_flu_ha = morgana_magic.nextclade_ds_tag_flu_ha, + nextclade_aa_subs_flu_ha = morgana_magic.nextclade_aa_subs_flu_ha, + nextclade_aa_dels_flu_ha = morgana_magic.nextclade_aa_dels_flu_ha, + nextclade_clade_flu_ha = morgana_magic.nextclade_clade_flu_ha, + nextclade_qc_flu_ha = morgana_magic.nextclade_qc_flu_ha, + nextclade_ds_tag_flu_na = morgana_magic.nextclade_ds_tag_flu_na, + nextclade_aa_subs_flu_na = morgana_magic.nextclade_aa_subs_flu_na, + nextclade_aa_dels_flu_na = morgana_magic.nextclade_aa_dels_flu_na, + nextclade_clade_flu_na = morgana_magic.nextclade_clade_flu_na, + nextclade_qc_flu_na = morgana_magic.nextclade_qc_flu_na } output { - Array[String?] 
identified_organisms = select_first([morgana_magic.organism]) + # kraken2 outputs + String kraken2_version = kraken2.kraken2_version + String kraken2_database = kraken2.kraken2_database + String kraken2_docker = kraken2.kraken2_docker + File kraken2_report = kraken2.kraken2_report + File kraken2_classified_report = kraken2.kraken2_classified_report + # krona outputs + String krona_version = krona.krona_version + String krona_docker = krona.krona_docker + File krona_html = krona.krona_html + # krakentools outputs + Array[String] identified_organisms = krakentools.organism_name + # docker image??? -- work on figuring out how to make this not an array + # Array[String] krakentools_docker = select_first([krakentools.krakentools_docker]), + File results_by_taxon_tsv = gather_scatter.gathered_results + } } \ No newline at end of file diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 71e8e952d..6cebdf4e8 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -75,6 +75,8 @@ workflow flu_track { Int? nextclade_output_parser_cpu Int? nextclade_output_parser_memory Int? 
nextclade_output_parser_disk_size + + Boolean analyze_flu_antiviral_substitutions = true } # IRMA will run if no assembly is provided (as in the case of TheiaCoV_FASTA) call irma_task.irma { @@ -167,7 +169,7 @@ workflow flu_track { } # if IRMA was run successfully, run the flu_antiviral substitutions task # this block must be placed beneath the previous block because it is used in this subworkflow - if (defined(irma.irma_assemblies)) { + if (defined(irma.irma_assemblies) && analyze_flu_antiviral_substitutions) { call flu_antiviral.flu_antiviral_substitutions { input: na_segment_assembly = irma.seg_na_assembly_padded, diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index 3d266cfe3..ba7a779db 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -1,11 +1,11 @@ version 1.0 -import "../../tasks/quality_control/advanced_metrics/task_vadr.wdl" as vadr_task import "../../tasks/quality_control/basic_statistics/task_consensus_qc.wdl" as consensus_qc_task import "../../tasks/species_typing/betacoronavirus/task_pangolin.wdl" as pangolin import "../../tasks/species_typing/lentivirus/task_quasitools.wdl" as quasitools import "../../tasks/taxon_id/task_nextclade.wdl" as nextclade_task import "../utilities/wf_organism_parameters.wdl" as set_organism_defaults +import "../utilities/wf_flu_track.wdl" as flu_track_wf workflow morgana_magic { input { @@ -14,6 +14,7 @@ workflow morgana_magic { File read1 File read2 String taxon_id + String seq_method } #### need to add more flu characterization call set_organism_defaults.organism_parameters { @@ -27,6 +28,17 @@ workflow morgana_magic { reference_genome = organism_parameters.reference, genome_length = organism_parameters.genome_length } + if (organism_parameters.standardized_organism == "flu") { + call flu_track_wf.flu_track { + input: + samplename = samplename, + read1 = read1, + read2 = read2, + seq_method = seq_method, + 
standardized_organism = organism_parameters.standardized_organism, + analyze_flu_antiviral_substitutions = false # don't try to look for antiviral substitutions?? or maybe? not sure + } + } if (organism_parameters.standardized_organism == "sars-cov-2") { call pangolin.pangolin4 { input: @@ -36,7 +48,6 @@ workflow morgana_magic { } } if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { - # tasks specific to either MPXV, sars-cov-2, or RSV-A/RSV-B call nextclade_task.nextclade_v3 { input: genome_fasta = assembly_fasta, @@ -49,20 +60,6 @@ workflow morgana_magic { organism = organism_parameters.standardized_organism } } - ### add flu - ##### is running vadr even something we want to do???? - if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "WNV" || organism_parameters.standardized_organism == "flu" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { - # tasks specific to MPXV, sars-cov-2, WNV, flu, rsv_a, and rsv_b - call vadr_task.vadr { - input: - genome_fasta = assembly_fasta, - assembly_length_unambiguous = consensus_qc.number_ATCG, - vadr_opts = organism_parameters.vadr_opts, - max_length = organism_parameters.vadr_maxlength, - skip_length = organism_parameters.vadr_skiplength, - memory = organism_parameters.vadr_memory - } - } ##### is running quasitools even something we want to do???? 
if (organism_parameters.standardized_organism == "HIV") { call quasitools.quasitools as quasitools_illumina_pe { @@ -74,6 +71,12 @@ workflow morgana_magic { } output { String organism = organism_parameters.standardized_organism + # Consensus QC outputs + Int number_N = consensus_qc.number_N + Int number_ATCG = consensus_qc.number_ATCG + Int number_Degenerate = consensus_qc.number_Degenerate + Int number_Total = consensus_qc.number_Total + Float percent_reference_coverage = consensus_qc.percent_reference_coverage # Pangolin outputs String? pango_lineage = pangolin4.pangolin_lineage String? pango_lineage_expanded = pangolin4.pangolin_lineage_expanded @@ -84,8 +87,8 @@ workflow morgana_magic { String? pangolin_docker = pangolin4.pangolin_docker String? pangolin_versions = pangolin4.pangolin_versions # Nextclade outputs for all organisms - String nextclade_version = select_first([nextclade_v3.nextclade_version, ""]) - String nextclade_docker = select_first([nextclade_v3.nextclade_docker, ""]) + String nextclade_version = select_first([nextclade_v3.nextclade_version, flu_track.nextclade_version, ""]) + String nextclade_docker = select_first([nextclade_v3.nextclade_docker, flu_track.nextclade_docker, ""]) # Nextclade outputs for non-flu File? nextclade_json = nextclade_v3.nextclade_json File? auspice_json = nextclade_v3.auspice_json @@ -96,15 +99,41 @@ workflow morgana_magic { String? nextclade_clade = nextclade_output_parser.nextclade_clade String? nextclade_lineage = nextclade_output_parser.nextclade_lineage String? nextclade_qc = nextclade_output_parser.nextclade_qc - # VADR Annotation QC - File? vadr_alerts_list = vadr.alerts_list - File? vadr_feature_tbl_pass = vadr.feature_tbl_pass - File? vadr_feature_tbl_fail = vadr.feature_tbl_fail - File? vadr_classification_summary_file = vadr.classification_summary_file - File? vadr_all_outputs_tar_gz = vadr.outputs_tgz - String? vadr_num_alerts = vadr.num_alerts - String? vadr_docker = vadr.vadr_docker - File? 
vadr_fastas_zip_archive = vadr.vadr_fastas_zip_archive + # Nextclade outputs for flu HA + File? nextclade_json_flu_ha = flu_track.nextclade_json_flu_ha + File? auspice_json_flu_ha = flu_track.auspice_json_flu_ha + File? nextclade_tsv_flu_ha = flu_track.nextclade_tsv_flu_ha + String? nextclade_ds_tag_flu_ha = flu_track.nextclade_ds_tag_flu_ha + String? nextclade_aa_subs_flu_ha = flu_track.nextclade_aa_subs_flu_ha + String? nextclade_aa_dels_flu_ha = flu_track.nextclade_aa_dels_flu_ha + String? nextclade_clade_flu_ha = flu_track.nextclade_clade_flu_ha + String? nextclade_qc_flu_ha = flu_track.nextclade_qc_flu_ha + # Nextclade outputs for flu NA + File? nextclade_json_flu_na = flu_track.nextclade_json_flu_na + File? auspice_json_flu_na = flu_track.auspice_json_flu_na + File? nextclade_tsv_flu_na = flu_track.nextclade_tsv_flu_na + String? nextclade_ds_tag_flu_na = flu_track.nextclade_ds_tag_flu_na + String? nextclade_aa_subs_flu_na = flu_track.nextclade_aa_subs_flu_na + String? nextclade_aa_dels_flu_na = flu_track.nextclade_aa_dels_flu_na + String? nextclade_clade_flu_na = flu_track.nextclade_clade_flu_na + String? nextclade_qc_flu_na = flu_track.nextclade_qc_flu_na + # Flu IRMA Outputs + String? irma_version = flu_track.irma_version + String? irma_docker = flu_track.irma_docker + String? irma_type = flu_track.irma_type + String? irma_subtype = flu_track.irma_subtype + String? irma_subtype_notes = flu_track.irma_subtype_notes + # Flu GenoFLU Outputs + String? genoflu_version = flu_track.genoflu_version + String? genoflu_genotype = flu_track.genoflu_genotype + String? genoflu_all_segments = flu_track.genoflu_all_segments + File? genoflu_output_tsv = flu_track.genoflu_output_tsv + # Flu Abricate Outputs + String? abricate_flu_type = flu_track.abricate_flu_type + String? abricate_flu_subtype = flu_track.abricate_flu_subtype + File? abricate_flu_results = flu_track.abricate_flu_results + String? abricate_flu_database = flu_track.abricate_flu_database + String? 
abricate_flu_version = flu_track.abricate_flu_version # HIV Outputs String? quasitools_version = quasitools_illumina_pe.quasitools_version String? quasitools_date = quasitools_illumina_pe.quasitools_date From e8e757d75cc871b33858bfe1686332eb151c4d66 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 10 Oct 2024 19:29:33 +0000 Subject: [PATCH 07/48] does this work? --- workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 30e4e1041..057969f7d 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -111,7 +111,7 @@ workflow theiameta_panel_illumina_pe { call gather_scatter_task.gather_scatter { input: samplename = samplename, - taxon_ids = select_first([taxon_ids]), + taxon_ids = taxon_ids, organism = krakentools.organism_name, extracted_read1 = krakentools.extracted_read1, extracted_read2 = krakentools.extracted_read2, From 8416a8ee9ba14502c480f737b2ebc943425bd24e Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 10 Oct 2024 19:31:36 +0000 Subject: [PATCH 08/48] set required for now --- workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 057969f7d..846f44ebe 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -18,7 +18,7 @@ workflow theiameta_panel_illumina_pe { String samplename File read1 File read2 - Array[Int]? taxon_ids # suggest using a workspace element if user wants to modify? + Array[Int] taxon_ids # suggest using a workspace element if user wants to modify? 
Int minimum_read_number = 100 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" From 8ddf11be2a603fd750772807f81acd9d16fb65f2 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 10 Oct 2024 19:48:36 +0000 Subject: [PATCH 09/48] correct terrible spelling --- docs/contributing/doc_contribution.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/doc_contribution.md b/docs/contributing/doc_contribution.md index 8a32abf56..faca77259 100644 --- a/docs/contributing/doc_contribution.md +++ b/docs/contributing/doc_contribution.md @@ -43,7 +43,7 @@ A brief description of the documentation structure is as follows: - `assets/` - Contains images and other files used in the documentation. - `figures/` - Contains images, figures, and workflow diagrams used in the documentation. For workflows that contain many images (such as BaseSpace_Fetch), it is recommended to create a subdirectory for the workflow. - `files/` - Contains files that are used in the documentation. This may include example outputs or templates. For workflows that contain many files (such as TheiaValidate), it is recommended to create a subdirectory for the workflow. - - `logos/` - Contains Theiagen logos and symbols used int he documentation. + - `logos/` - Contains Theiagen logos and symbols used in the documentation. - `metadata_formatters/` - Contains the most up-to-date metadata formatters for our submission workflows. - `new_workflow_template.md` - A template for adding a new workflow page to the documentation. 
- `contributing/` - Contains the Markdown files for our contribution guides, such as this file From 236230fd398144fe9bcb44cb370fc9c7b431f6c7 Mon Sep 17 00:00:00 2001 From: cimendes Date: Fri, 11 Oct 2024 12:44:32 +0000 Subject: [PATCH 10/48] add runtime --- .../utilities/data_handling/task_gather_scatter.wdl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 243b31530..ad260209a 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -57,6 +57,10 @@ task gather_scatter { Array[String?] nextclade_aa_dels_flu_na Array[String?] nextclade_clade_flu_na Array[String?] nextclade_qc_flu_na + String docker = "us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest" + Int disk_size = 50 + Int cpu = 2 + Int memory = 8 } command <<< ( @@ -111,4 +115,13 @@ task gather_scatter { output { File gathered_results = "~{samplename}.results.tsv" } + runtime { + docker: "~{docker}" + memory: memory + " GB" + cpu: cpu + disks: "local-disk " + disk_size + " SSD" + disk: disk_size + " GB" + maxRetries: 0 + preemptible: 0 + } } \ No newline at end of file From 3d23bce75d1d9cc424cc847961927f22a53f0a86 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Fri, 11 Oct 2024 15:57:57 +0000 Subject: [PATCH 11/48] start documentation --- .../theiameta_panel.md | 44 +++++++++++++++++++ .../workflows_alphabetically.md | 1 + docs/workflows_overview/workflows_kingdom.md | 1 + docs/workflows_overview/workflows_type.md | 1 + mkdocs.yml | 3 ++ 5 files changed, 50 insertions(+) create mode 100644 docs/workflows/genomic_characterization/theiameta_panel.md diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md new file mode 100644 index 000000000..3b1645577 --- /dev/null +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ 
-0,0 +1,44 @@ +# TheiaMeta Panel + +## Quick Facts + +| **Workflow Type** | **Applicable Kingdom** | **Last Known Changes** | **Command-line Compatibility** | **Workflow Level** | +|---|---|---|---|---| +| [Genomic Characterization](../../workflows_overview/workflows_type.md/#genomic_characterization) | [Viral](../../workflows_overview/workflows_kingdom.md/#viral) | PHB v2.X.X | Yes | Sample-level | + +## TheiaMeta_Panel_Illumina_PE_PHB + +TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. + +### Inputs + +| **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | +|---|---|---|---|---|---| +| task_name | **variable_name** | Type | Description | Default Value | Required/Optional | + +### Workflow Tasks + +Description of the workflow tasks + +??? task "`tool_name`: Description of tool" + Description of the task + + !!!
techdetails "Tool Name Technical Details" + | | Links | + | --- | --- | + | Task | [link to task on GitHub] | + | Software Source Code | [link to tool's source code] | + | Software Documentation | [link to tool's documentation] | + | Original Publication | [link to tool's publication] | + +### Outputs + +| **Variable** | **Type** | **Description** | +|---|---|---| +| variable_name | Type | Description | + +## References (if applicable) + +> reference1 + +> reference2 diff --git a/docs/workflows_overview/workflows_alphabetically.md b/docs/workflows_overview/workflows_alphabetically.md index 46438f671..128e9bee7 100644 --- a/docs/workflows_overview/workflows_alphabetically.md +++ b/docs/workflows_overview/workflows_alphabetically.md @@ -41,6 +41,7 @@ title: Alphabetical Workflows | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | | 
[**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | | [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_PHB:main?tab=info) | | [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | | [**TheiaValidate**](../workflows/standalone/theiavalidate.md)| This workflow performs basic comparisons between user-designated columns in two separate tables. 
| Any taxa | | No | v2.0.0 | [TheiaValidate_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaValidate_PHB:main?tab=info) | | [**Transfer_Column_Content**](../workflows/data_export/transfer_column_content.md)| Transfer contents of a specified Terra data table column for many samples ("entities") to a GCP storage bucket location | Any taxa | Set-level | Yes | v1.3.0 | [Transfer_Column_Content_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Transfer_Column_Content_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_kingdom.md b/docs/workflows_overview/workflows_kingdom.md index a97a0e2a5..ab8575a64 100644 --- a/docs/workflows_overview/workflows_kingdom.md +++ b/docs/workflows_overview/workflows_kingdom.md @@ -75,6 +75,7 @@ title: Workflows by Kingdom | [**Terra_2_GISAID**](../workflows/public_data_sharing/terra_2_gisaid.md)| Upload of assembly data to GISAID | SARS-CoV-2, Viral | Set-level | Yes | v1.2.1 | [Terra_2_GISAID_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_GISAID_PHB:main?tab=info) | | [**Terra_2_NCBI**](../workflows/public_data_sharing/terra_2_ncbi.md)| Upload of sequence data to NCBI | Bacteria, Mycotics, Viral | Set-level | No | v2.1.0 | [Terra_2_NCBI_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Terra_2_NCBI_PHB:main?tab=info) | | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), 
[TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_PHB:main?tab=info) | | [**Usher_PHB**](../workflows/phylogenetic_placement/usher.md)| Use UShER to rapidly and accurately place your samples on any existing phylogenetic tree | Monkeypox virus, SARS-CoV-2, Viral | Sample-level, Set-level | Yes | v2.1.0 | [Usher_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/Usher_PHB:main?tab=info) | | [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | diff --git a/docs/workflows_overview/workflows_type.md b/docs/workflows_overview/workflows_type.md index ce2ff2f5c..5e7e024b6 100644 --- a/docs/workflows_overview/workflows_type.md +++ b/docs/workflows_overview/workflows_type.md @@ -24,6 +24,7 @@ title: Workflows by 
Type | [**TheiaCov Workflow Series**](../workflows/genomic_characterization/theiacov.md) | Viral genome assembly, QC and characterization from amplicon sequencing | HIV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level, Set-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaCoV_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_PE_PHB:main?tab=info), [TheiaCoV_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_Illumina_SE_PHB:main?tab=info), [TheiaCoV_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ONT_PHB:main?tab=info), [TheiaCoV_ClearLabs_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_ClearLabs_PHB:main?tab=info), [TheiaCoV_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_PHB:main?tab=info), [TheiaCoV_FASTA_Batch_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaCoV_FASTA_Batch_PHB:main?tab=info) | | [**TheiaEuk**](../workflows/genomic_characterization/theiaeuk.md) | Mycotic genome assembly, QC and characterization from WGS data | Mycotics | Sample-level | Some optional features incompatible, Yes | v2.0.1 | [TheiaEuk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaEuk_Illumina_PE_PHB:main?tab=info) | | [**TheiaMeta**](../workflows/genomic_characterization/theiameta.md) | Genome assembly and QC from metagenomic sequencing | Any taxa | Sample-level | Yes | v2.0.0 | [TheiaMeta_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Illumina_PE_PHB:main?tab=info) | +| [**TheiaMeta Panel**](../workflows/genomic_characterization/theiameta_panel.md) | Genome assembly and QC from metagenomic sequencing 
using a panel | Viral | Sample-level | Yes | v2.X.X | [TheiaMeta_Panel_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaMeta_Panel_PHB:main?tab=info) | | [**TheiaProk Workflow Series**](../workflows/genomic_characterization/theiaprok.md) | Bacterial genome assembly, QC and characterization from WGS data | Bacteria | Sample-level | Some optional features incompatible, Yes | v2.2.0 | [TheiaProk_Illumina_PE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_PE_PHB:main?tab=info), [TheiaProk_Illumina_SE_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_Illumina_SE_PHB:main?tab=info), [TheiaProk_ONT_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_ONT_PHB:main?tab=info), [TheiaProk_FASTA_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/TheiaProk_FASTA_PHB:main?tab=info) | | [**VADR_Update**](../workflows/genomic_characterization/vadr_update.md)| Update VADR assignments | HAV, Influenza, Monkeypox virus, RSV-A, RSV-B, SARS-CoV-2, Viral, WNV | Sample-level | Yes | v1.2.1 | [VADR_Update_PHB](https://dockstore.org/workflows/github.com/theiagen/public_health_bioinformatics/VADR_Update_PHB:main?tab=info) | diff --git a/mkdocs.yml b/mkdocs.yml index 4ecf0457b..fbe6df019 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,6 +25,7 @@ nav: - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md - TheiaEuk: workflows/genomic_characterization/theiaeuk.md - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md - VADR_Update: workflows/genomic_characterization/vadr_update.md - Phylogenetic Construction: @@ -114,6 +115,7 @@ nav: - Terra_2_GISAID: workflows/public_data_sharing/terra_2_gisaid.md 
- Terra_2_NCBI: workflows/public_data_sharing/terra_2_ncbi.md - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - Usher_PHB: workflows/phylogenetic_placement/usher.md - VADR_Update: workflows/genomic_characterization/vadr_update.md - Workflows Alphabetically: @@ -151,6 +153,7 @@ nav: - TheiaCoV Workflow Series: workflows/genomic_characterization/theiacov.md - TheiaEuk: workflows/genomic_characterization/theiaeuk.md - TheiaMeta: workflows/genomic_characterization/theiameta.md + - TheiaMeta_Panel: workflows/genomic_characterization/theiameta_panel.md - TheiaProk Workflow Series: workflows/genomic_characterization/theiaprok.md - TheiaValidate: workflows/standalone/theiavalidate.md - Transfer_Column_Content: workflows/data_export/transfer_column_content.md From 75c7224c1f20959fe22139584bd2726fa71f4d22 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 15 Oct 2024 16:22:09 +0000 Subject: [PATCH 12/48] add information on workflow tasks to documentation --- .../theiameta_panel.md | 102 ++++++++++++++++-- 1 file changed, 91 insertions(+), 11 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 3b1645577..e5faa419d 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -8,7 +8,7 @@ ## TheiaMeta_Panel_Illumina_PE_PHB -TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. 
+TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. ### Inputs @@ -18,18 +18,98 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; ### Workflow Tasks -Description of the workflow tasks -??? task "`tool_name`: Description of tool" - Description of the task +??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" - !!! techdetails "Tool Name Technical Details" - | | Links | - | --- | --- | - | Task | [link to task on GitHub] | - | Software Source Code | [link to tool's source code] | - | Software Documentation | [link to tool's documentation] | - | Original Publication | [link to tool's publication] | + `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. + + **Read quality trimming** + + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. 
+ + If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + + | **Parameter** | **Explanation** | + | --- | --- | + | -g | enables polyG tail trimming | + | -5 20 | enables read end-trimming | + | -3 20 | enables read end-trimming | + | --detect_adapter_for_pe | enables adapter-trimming **only for paired-end reads** | + + **Adapter removal** + + The `BBDuk` task removes adapters from sequence reads. To do this: + + - [Repair](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/repair-guide/) from the [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) package reorders reads in paired fastq files to ensure the forward and reverse reads of a pair are in the same position in the two fastq files. + - [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) (*"Bestus Bioinformaticus" Decontamination Using Kmers*) is then used to trim the adapters and filter out all reads that have a 31-mer match to [PhiX](https://emea.illumina.com/products/by-type/sequencing-kits/cluster-gen-sequencing-reagents/phix-control-v3.html), which is commonly added to Illumina sequencing runs to monitor and/or improve overall run quality. + + ??? toggle "What are adapters and why do they need to be removed?" + Adapters are manufactured oligonucleotide sequences attached to DNA fragments during the library preparation process. In Illumina sequencing, these adapter sequences are required for attaching reads to flow cells. You can read more about Illumina adapters [here](https://emea.support.illumina.com/bulletins/2020/06/illumina-adapter-portfolio.html). For genome analysis, it's important to remove these sequences since they're not actually from your sample. If you don't remove them, the downstream analysis may be affected. 
+ + **Read Quantification** + + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality. + + **Read Identification (optional)** + + The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. + + The MIDAS tool was originally designed for metagenomic sequencing data but has been co-opted for use with bacterial isolate WGS methods. It can be used to detect contamination present in raw sequencing data by estimating bacterial species abundance in bacterial isolate WGS data. If a secondary genus is detected above a relative frequency of 0.01 (1%), then the sample should fail QC and be investigated further for potential contamination. + + This task is similar to those used in commercial software, BioNumerics, for estimating secondary species abundance. + + ??? toggle "How are the MIDAS output columns determined?" 
+ + Example MIDAS report in the `midas_report` column: + + | species_id | count_reads | coverage | relative_abundance | + | --- | --- | --- | --- | + | Salmonella_enterica_58156 | 3309 | 89.88006645 | 0.855888033 | + | Salmonella_enterica_58266 | 501 | 11.60606061 | 0.110519371 | + | Salmonella_enterica_53987 | 99 | 2.232896237 | 0.021262881 | + | Citrobacter_youngae_61659 | 46 | 0.995216227 | 0.009477003 | + | Escherichia_coli_58110 | 5 | 0.123668877 | 0.001177644 | + + MIDAS report column descriptions: + + - species_id: species identifier + - count_reads: number of reads mapped to marker genes + - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome + - relative_abundance: estimated relative abundance of species in metagenome + + The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). + + Alternatively to `MIDAS`, the `Kraken2` task can also be turned on through setting the `call_kraken` input variable as `true` for the identification of reads to detect contamination with non-target taxa. + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken_db optional input. 
A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). + + !!! techdetails "read_QC_trim Technical Details" + + | | Links | + | --- | --- | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | + | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + +??? task "`kraken`: Taxonomic Classification" + + Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. + + Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow + + !!! info "Database-dependent" + The Kraken2 software is database-dependent and **taxonomic assignments are highly sensitive to the database used**. An appropriate database should contain the expected organism(s) (e.g. _Escherichia coli_) and other taxa that may be present in the reads (e.g. _Citrobacter freundii_, a common contaminant). + + !!! techdetails "Kraken2 Technical Details" + + | | Links | + | --- | --- | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | + | Software Documentation | | + | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | ### Outputs From e8312a5f401b974a980bd8fe6c08a7eaff52ccf8 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 15 Oct 2024 16:43:21 +0000 Subject: [PATCH 13/48] remove krona --- .../theiameta_panel.md | 51 ++++++++++--------- .../wf_theiameta_panel_illumina_pe.wdl | 10 ---- 2 files changed, 27 insertions(+), 34 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index e5faa419d..fc0a43ed9 100644 --- 
a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -18,16 +18,17 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; ### Workflow Tasks - ??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. **Read quality trimming** - Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_minlen`. + Either `trimmomatic` or `fastp` can be used for read-quality trimming. Trimmomatic is used by default. Both tools trim low-quality regions of reads with a sliding window (with a window size of `trim_window_size`), cutting once the average quality within the window falls below `trim_quality_trim_score`. They will both discard the read if it is trimmed below `trim_min_length`. + + By default, the trim_min_length is set to 75 bp. This is likely _too high_ for data generated using the Illumina VSP panel. We recommend setting this parameter to `50` in this case. 
- If fastp is selected for analysis, fastp also implements the additional read-trimming steps indicated below: + If fastp is selected for analysis, fastp also implements the additional read-trimming parameters indicated below: | **Parameter** | **Explanation** | | --- | --- | @@ -48,19 +49,17 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; **Read Quantification** - There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In TheiaProk_Illumina_PE, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality. + There are two methods for read quantification to choose from: [`fastq-scan`](https://github.com/rpetit3/fastq-scan) (default) or [`fastqc`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Both quantify the forward and reverse reads in FASTQ files. In paired-end workflows, they also provide the total number of read pairs. This task is run once with raw reads as input and once with clean reads as input. If QC has been performed correctly, you should expect **fewer** clean reads than raw reads. `fastqc` also provides a graphical visualization of the read quality in an HTML file. **Read Identification (optional)** The `MIDAS` task is for the identification of reads to detect contamination with non-target taxa. This task is optional and turned off by default. It can be used by setting the `call_midas` input variable to `true`. - The MIDAS tool was originally designed for metagenomic sequencing data but has been co-opted for use with bacterial isolate WGS methods. 
It can be used to detect contamination present in raw sequencing data by estimating bacterial species abundance in bacterial isolate WGS data. If a secondary genus is detected above a relative frequency of 0.01 (1%), then the sample should fail QC and be investigated further for potential contamination. - - This task is similar to those used in commercial software, BioNumerics, for estimating secondary species abundance. + The MIDAS reference database, located at **`gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz`**, is provided as the default. It is possible to provide a custom database. More information is available [here](https://github.com/snayfach/MIDAS/blob/master/docs/ref_db.md). ??? toggle "How are the MIDAS output columns determined?" - Example MIDAS report in the `midas_report` column: + Example MIDAS report in the `midas_report` column: | species_id | count_reads | coverage | relative_abundance | | --- | --- | --- | --- | @@ -76,41 +75,45 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; - count_reads: number of reads mapped to marker genes - coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome - relative_abundance: estimated relative abundance of species in metagenome - - The value in the `midas_primary_genus` column is derived by ordering the rows in order of "relative_abundance" and identifying the genus of top species in the "species_id" column (Salmonella). The value in the `midas_secondary_genus` column is derived from the genus of the second-most prevalent genus in the "species_id" column (Citrobacter). The `midas_secondary_genus_abundance` column is the "relative_abundance" of the second-most prevalent genus (0.009477003). The `midas_secondary_genus_coverage` is the "coverage" of the second-most prevalent genus (0.995216227). 
- - Alternatively to `MIDAS`, the `Kraken2` task can also be turned on through setting the `call_kraken` input variable as `true` for the identification of reads to detect contamination with non-target taxa. - - Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate) whole genome sequence data. A database must be provided if this optional module is activated, through the kraken_db optional input. A list of suggested databases can be found on [Kraken2 standalone documentation](../standalone/kraken2.md). - + !!! techdetails "read_QC_trim Technical Details" | | Links | | --- | --- | - | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/workflows/wf_read_QC_trim.wdl) | - | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/quality_control/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_midas.wdl)
[task_kraken2.wdl](https://github.com/theiagen/public_health_bacterial_genomics/blob/main/tasks/taxon_id/task_kraken2.wdl) | - | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2)| - | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS); [Kraken2](https://github.com/DerrickWood/kraken2/wiki) | - | Original Publication(s) | *[Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
*[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
*[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/)
*[Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | + | Sub-workflow | [wf_read_QC_trim.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/workflows/utilities/wf_read_QC_trim.wdl) | + | Tasks | [task_fastp.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_fastp.wdl)
[task_trimmomatic.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_trimmomatic.wdl)
[task_bbduk.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_bbduk.wdl)
[task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl)
[task_midas.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_midas.wdl) | + | Software Source Code | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](https://github.com/usadellab/Trimmomatic); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS)| + | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS) | + | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/) | -??? task "`kraken`: Taxonomic Classification" +??? task "`kraken2`: Taxonomic Classification" Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. - Kraken2 is run on the set of raw reads, provided as input, as well as the set of clean reads that are resulted from the `read_QC_trim` workflow + Kraken2 is run on the clean reads that result from the `read_QC_trim` subworkflow. By default, the Kraken2 database is set to the `k2_viral_20240112` database, located at `"gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz"`. !!! info "Database-dependent" The Kraken2 software is database-dependent and **taxonomic assignments are highly sensitive to the database used**. An appropriate database should contain the expected organism(s) (e.g. _Escherichia coli_) and other taxa that may be present in the reads (e.g. _Citrobacter freundii_, a common contaminant). !!! techdetails "Kraken2 Technical Details" - | | Links | | --- | --- | - | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_kraken2.wdl) | + | Task | [task_kraken2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/contamination/task_kraken2.wdl) | | Software Source Code | [Kraken2 on GitHub](https://github.com/DerrickWood/kraken2/) | | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | +??? 
task "`KrakenTools extract_kraken_reads`: Read Binning" + KrakenTools is a collection of scripts that can be used to help downstream analysis of Kraken2 results. In particular, this task uses the `extract_kraken_reads` script, which extracts reads classified at any user-specified taxonomy IDs. All parent and children reads of the specified taxonomic ID are also extracted. + + !!! techdetails "KrakenTools Technical Details" + | | Links | + | --- | --- | + | Task | [task_kraken_tools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_krakentools.wdl) + | Software Source Code | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | + | Software Documentation | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | + | Original Publication | [Metagenome analysis using the Kraken software suite](https://doi.org/10.1038/s41596-022-00738-y) | + ### Outputs | **Variable** | **Type** | **Description** | diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 846f44ebe..c86b66c88 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -7,7 +7,6 @@ import "../../tasks/quality_control/basic_statistics/task_quast.wdl" as quast_ta import "../../tasks/quality_control/read_filtering/task_pilon.wdl" as pilon_task import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task import "../../tasks/taxon_id/task_krakentools.wdl" as krakentools_task -import "../../tasks/taxon_id/contamination/task_krona.wdl" as krona_task import "../../tasks/utilities/data_handling/task_gather_scatter.wdl" as gather_scatter_task import "../../tasks/utilities/data_handling/task_parse_mapping.wdl" as parse_mapping_task import "../utilities/wf_morgana_magic.wdl" as morgana_magic_workflow @@ -38,11 +37,6 @@ workflow theiameta_panel_illumina_pe { read2 = 
read_QC_trim.read2_clean, kraken2_db = kraken2_db } - call krona_task.krona as krona { - input: - kraken2_report = kraken2.kraken2_report, - samplename = samplename - } scatter (taxon_id in taxon_ids) { call krakentools_task.extract_kraken_reads as krakentools { input: @@ -164,10 +158,6 @@ workflow theiameta_panel_illumina_pe { String kraken2_docker = kraken2.kraken2_docker File kraken2_report = kraken2.kraken2_report File kraken2_classified_report = kraken2.kraken2_classified_report - # krona outputs - String krona_version = krona.krona_version - String krona_docker = krona.krona_docker - File krona_html = krona.krona_html # krakentools outputs Array[String] identified_organisms = krakentools.organism_name # docker image??? -- work on figuring out how to make this not an array From b5260e4a66f4b9486c33f0641f7ea0bb1bde6d35 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 16 Oct 2024 14:50:18 +0000 Subject: [PATCH 14/48] add + to everything???? --- .../data_handling/task_gather_scatter.wdl | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index ad260209a..f73b6f1b5 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -3,60 +3,60 @@ version 1.0 task gather_scatter { input { String samplename - Array[Int]? taxon_ids + Array[Int]?+ taxon_ids # krakentools outputs - Array[String?] organism - Array[File?] extracted_read1 - Array[File?] extracted_read2 - Array[String]? krakentools_docker + Array[String?]+ organism + Array[File?]+ extracted_read1 + Array[File?]+ extracted_read2 + Array[String]?+ krakentools_docker # fastq_scan outputs - Array[Int?] fastq_scan_num_reads_binned1 - Array[Int?] fastq_scan_num_reads_binned2 - Array[String?] fastq_scan_num_reads_binned_pairs - Array[String?] fastq_scan_docker - Array[String?] 
fastq_scan_version + Array[Int?]+ fastq_scan_num_reads_binned1 + Array[Int?]+ fastq_scan_num_reads_binned2 + Array[String?]+ fastq_scan_num_reads_binned_pairs + Array[String?]+ fastq_scan_docker + Array[String?]+ fastq_scan_version # Assembly - Array[File?] pilon_assembly_fasta ### maybe????? + Array[File?]+ pilon_assembly_fasta ### maybe????? # quast outputs - Array[Int?] quast_genome_length - Array[Int?] quast_number_contigs - Array[Int?] quast_n50 - Array[Float?] quast_gc_percent + Array[Int?]+ quast_genome_length + Array[Int?]+ quast_number_contigs + Array[Int?]+ quast_n50 + Array[Float?]+ quast_gc_percent # consensus qc outputs - Array[Int?] number_N - Array[Int?] number_ATCG - Array[Int?] number_Degenerate - Array[Int?] number_Total - Array[Float?] percent_reference_coverage + Array[Int?]+ number_N + Array[Int?]+ number_ATCG + Array[Int?]+ number_Degenerate + Array[Int?]+ number_Total + Array[Float?]+ percent_reference_coverage # pangolin outputs - Array[String?] pango_lineage - Array[String?] pango_lineage_expanded - Array[String?] pangolin_conflicts - Array[String?] pangolin_notes - Array[String?] pangolin_assignment_version - Array[String?] pangolin_versions - Array[String?] pangolin_docker + Array[String?]+ pango_lineage + Array[String?]+ pango_lineage_expanded + Array[String?]+ pangolin_conflicts + Array[String?]+ pangolin_notes + Array[String?]+ pangolin_assignment_version + Array[String?]+ pangolin_versions + Array[String?]+ pangolin_docker # Nextclade outputs for non-flu - Array[String?] nextclade_version - Array[String?] nextclade_docker - Array[String?] nextclade_ds_tag - Array[String?] nextclade_aa_subs - Array[String?] nextclade_aa_dels - Array[String?] nextclade_clade - Array[String?] nextclade_lineage - Array[String?] 
nextclade_qc + Array[String?]+ nextclade_version + Array[String?]+ nextclade_docker + Array[String?]+ nextclade_ds_tag + Array[String?]+ nextclade_aa_subs + Array[String?]+ nextclade_aa_dels + Array[String?]+ nextclade_clade + Array[String?]+ nextclade_lineage + Array[String?]+ nextclade_qc # Nextclade outputs for flu HA - Array[String?] nextclade_ds_tag_flu_ha - Array[String?] nextclade_aa_subs_flu_ha - Array[String?] nextclade_aa_dels_flu_ha - Array[String?] nextclade_clade_flu_ha - Array[String?] nextclade_qc_flu_ha + Array[String?]+ nextclade_ds_tag_flu_ha + Array[String?]+ nextclade_aa_subs_flu_ha + Array[String?]+ nextclade_aa_dels_flu_ha + Array[String?]+ nextclade_clade_flu_ha + Array[String?]+ nextclade_qc_flu_ha # Nextclade outputs for flu NA - Array[String?] nextclade_ds_tag_flu_na - Array[String?] nextclade_aa_subs_flu_na - Array[String?] nextclade_aa_dels_flu_na - Array[String?] nextclade_clade_flu_na - Array[String?] nextclade_qc_flu_na + Array[String?]+ nextclade_ds_tag_flu_na + Array[String?]+ nextclade_aa_subs_flu_na + Array[String?]+ nextclade_aa_dels_flu_na + Array[String?]+ nextclade_clade_flu_na + Array[String?]+ nextclade_qc_flu_na String docker = "us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest" Int disk_size = 50 Int cpu = 2 From aad8ec482f959e54e3564f63ce9caf2fce2a00d3 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 16 Oct 2024 14:51:23 +0000 Subject: [PATCH 15/48] remove from one array --- tasks/utilities/data_handling/task_gather_scatter.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index f73b6f1b5..16a40d23a 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -3,7 +3,7 @@ version 1.0 task gather_scatter { input { String samplename - Array[Int]?+ taxon_ids + Array[Int]? 
taxon_ids # krakentools outputs Array[String?]+ organism Array[File?]+ extracted_read1 From 991d540864e3498748633031cdb68b81316b7782 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 16 Oct 2024 14:51:55 +0000 Subject: [PATCH 16/48] also remove from that one too --- tasks/utilities/data_handling/task_gather_scatter.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 16a40d23a..fc392ae57 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -8,7 +8,7 @@ task gather_scatter { Array[String?]+ organism Array[File?]+ extracted_read1 Array[File?]+ extracted_read2 - Array[String]?+ krakentools_docker + Array[String]? krakentools_docker # fastq_scan outputs Array[Int?]+ fastq_scan_num_reads_binned1 Array[Int?]+ fastq_scan_num_reads_binned2 From f56ca68e94a561983794bef126c87864657ea68b Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 16 Oct 2024 17:35:08 +0000 Subject: [PATCH 17/48] trying something cRaZy --- .../data_handling/task_gather_scatter.wdl | 92 ++++++++++--------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index fc392ae57..87331d924 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -3,60 +3,60 @@ version 1.0 task gather_scatter { input { String samplename - Array[Int]? taxon_ids + Array[Int] taxon_ids = [0000000] # krakentools outputs - Array[String?]+ organism - Array[File?]+ extracted_read1 - Array[File?]+ extracted_read2 - Array[String]? krakentools_docker + Array[String?] organism = [""] + Array[File?] extracted_read1 = [""] + Array[File?] 
extracted_read2 = [""] + Array[String] krakentools_docker = [""] # fastq_scan outputs - Array[Int?]+ fastq_scan_num_reads_binned1 - Array[Int?]+ fastq_scan_num_reads_binned2 - Array[String?]+ fastq_scan_num_reads_binned_pairs - Array[String?]+ fastq_scan_docker - Array[String?]+ fastq_scan_version + Array[Int?] fastq_scan_num_reads_binned1 = [0000000] + Array[Int?] fastq_scan_num_reads_binned2 = [0000000] + Array[String?] fastq_scan_num_reads_binned_pairs = [""] + Array[String?] fastq_scan_docker = [""] + Array[String?] fastq_scan_version = [""] # Assembly - Array[File?]+ pilon_assembly_fasta ### maybe????? + Array[File?] pilon_assembly_fasta = [""]### maybe????? # quast outputs - Array[Int?]+ quast_genome_length - Array[Int?]+ quast_number_contigs - Array[Int?]+ quast_n50 - Array[Float?]+ quast_gc_percent + Array[Int?] quast_genome_length = [0000000] + Array[Int?] quast_number_contigs = [0000000] + Array[Int?] quast_n50 = [0000000] + Array[Float?] quast_gc_percent = [0000000] # consensus qc outputs - Array[Int?]+ number_N - Array[Int?]+ number_ATCG - Array[Int?]+ number_Degenerate - Array[Int?]+ number_Total - Array[Float?]+ percent_reference_coverage + Array[Int?] number_N = [0000000] + Array[Int?] number_ATCG = [0000000] + Array[Int?] number_Degenerate = [0000000] + Array[Int?] number_Total = [0000000] + Array[Float?] percent_reference_coverage = [0000000] # pangolin outputs - Array[String?]+ pango_lineage - Array[String?]+ pango_lineage_expanded - Array[String?]+ pangolin_conflicts - Array[String?]+ pangolin_notes - Array[String?]+ pangolin_assignment_version - Array[String?]+ pangolin_versions - Array[String?]+ pangolin_docker + Array[String?] pango_lineage = [""] + Array[String?] pango_lineage_expanded = [""] + Array[String?] pangolin_conflicts = [""] + Array[String?] pangolin_notes = [""] + Array[String?] pangolin_assignment_version = [""] + Array[String?] pangolin_versions = [""] + Array[String?] 
pangolin_docker = [""] # Nextclade outputs for non-flu - Array[String?]+ nextclade_version - Array[String?]+ nextclade_docker - Array[String?]+ nextclade_ds_tag - Array[String?]+ nextclade_aa_subs - Array[String?]+ nextclade_aa_dels - Array[String?]+ nextclade_clade - Array[String?]+ nextclade_lineage - Array[String?]+ nextclade_qc + Array[String?] nextclade_version = [""] + Array[String?] nextclade_docker = [""] + Array[String?] nextclade_ds_tag = [""] + Array[String?] nextclade_aa_subs = [""] + Array[String?] nextclade_aa_dels = [""] + Array[String?] nextclade_clade = [""] + Array[String?] nextclade_lineage = [""] + Array[String?] nextclade_qc # Nextclade outputs for flu HA - Array[String?]+ nextclade_ds_tag_flu_ha - Array[String?]+ nextclade_aa_subs_flu_ha - Array[String?]+ nextclade_aa_dels_flu_ha - Array[String?]+ nextclade_clade_flu_ha - Array[String?]+ nextclade_qc_flu_ha + Array[String?] nextclade_ds_tag_flu_ha = [""] + Array[String?] nextclade_aa_subs_flu_ha = [""] + Array[String?] nextclade_aa_dels_flu_ha = [""] + Array[String?] nextclade_clade_flu_ha = [""] + Array[String?] nextclade_qc_flu_ha = [""] # Nextclade outputs for flu NA - Array[String?]+ nextclade_ds_tag_flu_na - Array[String?]+ nextclade_aa_subs_flu_na - Array[String?]+ nextclade_aa_dels_flu_na - Array[String?]+ nextclade_clade_flu_na - Array[String?]+ nextclade_qc_flu_na + Array[String?] nextclade_ds_tag_flu_na = [""] + Array[String?] nextclade_aa_subs_flu_na = [""] + Array[String?] nextclade_aa_dels_flu_na = [""] + Array[String?] nextclade_clade_flu_na = [""] + Array[String?] 
nextclade_qc_flu_na = [""] String docker = "us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest" Int disk_size = 50 Int cpu = 2 @@ -111,6 +111,8 @@ task gather_scatter { <(echo "~{sep="\n" nextclade_clade_flu_na}") \ <(echo "~{sep="\n" nextclade_qc_flu_na}") ) > ~{samplename}.results.tsv + + sed -i 's/0000000//g' ~{samplename}.results.tsv >>> output { File gathered_results = "~{samplename}.results.tsv" From a04af9982816c7998a588a8c93798be836395c8c Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 17 Oct 2024 15:02:44 +0000 Subject: [PATCH 18/48] it doesn't work --- .../data_handling/task_gather_scatter.wdl | 198 +++++++++--------- .../wf_theiameta_panel_illumina_pe.wdl | 90 ++++---- 2 files changed, 149 insertions(+), 139 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 87331d924..04b8a901f 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -3,116 +3,126 @@ version 1.0 task gather_scatter { input { String samplename - Array[Int] taxon_ids = [0000000] + File? taxon_ids # krakentools outputs - Array[String?] organism = [""] - Array[File?] extracted_read1 = [""] - Array[File?] extracted_read2 = [""] - Array[String] krakentools_docker = [""] + File? organism + File? extracted_read1 + File? extracted_read2 + File? krakentools_docker # fastq_scan outputs - Array[Int?] fastq_scan_num_reads_binned1 = [0000000] - Array[Int?] fastq_scan_num_reads_binned2 = [0000000] - Array[String?] fastq_scan_num_reads_binned_pairs = [""] - Array[String?] fastq_scan_docker = [""] - Array[String?] fastq_scan_version = [""] + File? fastq_scan_num_reads_binned1 + File? fastq_scan_num_reads_binned2 + File? fastq_scan_num_reads_binned_pairs + File? fastq_scan_docker + File? fastq_scan_version # Assembly - Array[File?] pilon_assembly_fasta = [""]### maybe????? + File? pilon_assembly_fasta### maybe????? 
# quast outputs - Array[Int?] quast_genome_length = [0000000] - Array[Int?] quast_number_contigs = [0000000] - Array[Int?] quast_n50 = [0000000] - Array[Float?] quast_gc_percent = [0000000] + File? quast_genome_length + File? quast_number_contigs + File? quast_n50 + File? quast_gc_percent # consensus qc outputs - Array[Int?] number_N = [0000000] - Array[Int?] number_ATCG = [0000000] - Array[Int?] number_Degenerate = [0000000] - Array[Int?] number_Total = [0000000] - Array[Float?] percent_reference_coverage = [0000000] + File? number_N + File? number_ATCG + File? number_Degenerate + File? number_Total + File? percent_reference_coverage # pangolin outputs - Array[String?] pango_lineage = [""] - Array[String?] pango_lineage_expanded = [""] - Array[String?] pangolin_conflicts = [""] - Array[String?] pangolin_notes = [""] - Array[String?] pangolin_assignment_version = [""] - Array[String?] pangolin_versions = [""] - Array[String?] pangolin_docker = [""] + File? pango_lineage + File? pango_lineage_expanded + File? pangolin_conflicts + File? pangolin_notes + File? pangolin_assignment_version + File? pangolin_versions + File? pangolin_docker # Nextclade outputs for non-flu - Array[String?] nextclade_version = [""] - Array[String?] nextclade_docker = [""] - Array[String?] nextclade_ds_tag = [""] - Array[String?] nextclade_aa_subs = [""] - Array[String?] nextclade_aa_dels = [""] - Array[String?] nextclade_clade = [""] - Array[String?] nextclade_lineage = [""] - Array[String?] nextclade_qc + File? nextclade_version + File? nextclade_docker + File? nextclade_ds_tag + File? nextclade_aa_subs + File? nextclade_aa_dels + File? nextclade_clade + File? nextclade_lineage + File? nextclade_qc # Nextclade outputs for flu HA - Array[String?] nextclade_ds_tag_flu_ha = [""] - Array[String?] nextclade_aa_subs_flu_ha = [""] - Array[String?] nextclade_aa_dels_flu_ha = [""] - Array[String?] nextclade_clade_flu_ha = [""] - Array[String?] nextclade_qc_flu_ha = [""] + File? 
nextclade_ds_tag_flu_ha + File? nextclade_aa_subs_flu_ha + File? nextclade_aa_dels_flu_ha + File? nextclade_clade_flu_ha + File? nextclade_qc_flu_ha # Nextclade outputs for flu NA - Array[String?] nextclade_ds_tag_flu_na = [""] - Array[String?] nextclade_aa_subs_flu_na = [""] - Array[String?] nextclade_aa_dels_flu_na = [""] - Array[String?] nextclade_clade_flu_na = [""] - Array[String?] nextclade_qc_flu_na = [""] + File? nextclade_ds_tag_flu_na + File? nextclade_aa_subs_flu_na + File? nextclade_aa_dels_flu_na + File? nextclade_clade_flu_na + File? nextclade_qc_flu_na + # change to be a docker with pandas String docker = "us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest" Int disk_size = 50 Int cpu = 2 Int memory = 8 } command <<< - ( - echo -e "taxon_ids\torganism\textracted_read1\textracted_read2\tkrakentools_docker\tfastq_scan_num_reads_binned1\tfastq_scan_num_reads_binned2\tfastq_scan_num_reads_binned_pairs\tfastq_scan_docker\tfastq_scan_version\tpilon_assembly_fasta\tquast_genome_length\tquast_number_contigs\tquast_n50\tquast_gc_percent\tnumber_N\tnumber_ATCG\tnumber_Degenerate\tnumber_Total\tpercent_reference_coverage\tpango_lineage\tpango_lineage_expanded\tpangolin_conflicts\tpangolin_notes\tpangolin_assignment_version\tpangolin_versions\tpangolin_docker\tnextclade_version\tnextclade_docker\tnextclade_ds_tag\tnextclade_aa_subs\tnextclade_aa_dels\tnextclade_clade\tnextclade_lineage\tnextclade_qc\tnextclade_ds_tag_flu_ha\tnextclade_aa_subs_flu_ha\tnextclade_aa_dels_flu_ha\tnextclade_clade_flu_ha\tnextclade_qc_flu_ha\tnextclade_ds_tag_flu_na\tnextclade_aa_subs_flu_na\tnextclade_aa_dels_flu_na\tnextclade_clade_flu_na\tnextclade_qc_flu_na" - paste <(echo "~{sep="\n" taxon_ids}") \ - <(echo "~{sep="\n" organism}") \ - <(echo "~{sep="\n" extracted_read1}") \ - <(echo "~{sep="\n" extracted_read2}") \ - <(echo "~{sep="\n" krakentools_docker}") \ - <(echo "~{sep="\n" fastq_scan_num_reads_binned1}") \ - <(echo "~{sep="\n" fastq_scan_num_reads_binned2}") \ - <(echo 
"~{sep="\n" fastq_scan_num_reads_binned_pairs}") \ - <(echo "~{sep="\n" fastq_scan_docker}") \ - <(echo "~{sep="\n" fastq_scan_version}") \ - <(echo "~{sep="\n" pilon_assembly_fasta}") \ - <(echo "~{sep="\n" quast_genome_length}") \ - <(echo "~{sep="\n" quast_number_contigs}") \ - <(echo "~{sep="\n" quast_n50}") \ - <(echo "~{sep="\n" quast_gc_percent}") \ - <(echo "~{sep="\n" number_N}") \ - <(echo "~{sep="\n" number_ATCG}") \ - <(echo "~{sep="\n" number_Degenerate}") \ - <(echo "~{sep="\n" number_Total}") \ - <(echo "~{sep="\n" percent_reference_coverage}") \ - <(echo "~{sep="\n" pango_lineage}") \ - <(echo "~{sep="\n" pango_lineage_expanded}") \ - <(echo "~{sep="\n" pangolin_conflicts}") \ - <(echo "~{sep="\n" pangolin_notes}") \ - <(echo "~{sep="\n" pangolin_assignment_version}") \ - <(echo "~{sep="\n" pangolin_versions}") \ - <(echo "~{sep="\n" pangolin_docker}") \ - <(echo "~{sep="\n" nextclade_version}") \ - <(echo "~{sep="\n" nextclade_docker}") \ - <(echo "~{sep="\n" nextclade_ds_tag}") \ - <(echo "~{sep="\n" nextclade_aa_subs}") \ - <(echo "~{sep="\n" nextclade_aa_dels}") \ - <(echo "~{sep="\n" nextclade_clade}") \ - <(echo "~{sep="\n" nextclade_lineage}") \ - <(echo "~{sep="\n" nextclade_qc}") \ - <(echo "~{sep="\n" nextclade_ds_tag_flu_ha}") \ - <(echo "~{sep="\n" nextclade_aa_subs_flu_ha}") \ - <(echo "~{sep="\n" nextclade_aa_dels_flu_ha}") \ - <(echo "~{sep="\n" nextclade_clade_flu_ha}") \ - <(echo "~{sep="\n" nextclade_qc_flu_ha}") \ - <(echo "~{sep="\n" nextclade_ds_tag_flu_na}") \ - <(echo "~{sep="\n" nextclade_aa_subs_flu_na}") \ - <(echo "~{sep="\n" nextclade_aa_dels_flu_na}") \ - <(echo "~{sep="\n" nextclade_clade_flu_na}") \ - <(echo "~{sep="\n" nextclade_qc_flu_na}") - ) > ~{samplename}.results.tsv + python3<>> output { File gathered_results = "~{samplename}.results.tsv" diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index c86b66c88..926864d7a 100644 --- 
a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -105,51 +105,51 @@ workflow theiameta_panel_illumina_pe { call gather_scatter_task.gather_scatter { input: samplename = samplename, - taxon_ids = taxon_ids, - organism = krakentools.organism_name, - extracted_read1 = krakentools.extracted_read1, - extracted_read2 = krakentools.extracted_read2, - krakentools_docker = krakentools.krakentools_docker, - fastq_scan_num_reads_binned1 = fastq_scan_binned.read1_seq, - fastq_scan_num_reads_binned2 = fastq_scan_binned.read2_seq, - fastq_scan_num_reads_binned_pairs = fastq_scan_binned.read_pairs, - fastq_scan_docker = fastq_scan_binned.fastq_scan_docker, - fastq_scan_version = fastq_scan_binned.version, - pilon_assembly_fasta = pilon.assembly_fasta, # maybe?? - quast_genome_length = quast.genome_length, - quast_number_contigs = quast.number_contigs, - quast_n50 = quast.n50_value, - quast_gc_percent = quast.gc_percent, - number_N = morgana_magic.number_N, - number_ATCG = morgana_magic.number_ATCG, - number_Degenerate = morgana_magic.number_Degenerate, - number_Total = morgana_magic.number_Total, - percent_reference_coverage = morgana_magic.percent_reference_coverage, - pango_lineage = morgana_magic.pango_lineage, - pango_lineage_expanded = morgana_magic.pango_lineage_expanded, - pangolin_conflicts = morgana_magic.pangolin_conflicts, - pangolin_notes = morgana_magic.pangolin_notes, - pangolin_assignment_version = morgana_magic.pangolin_assignment_version, - pangolin_versions = morgana_magic.pangolin_versions, - pangolin_docker = morgana_magic.pangolin_docker, - nextclade_version = morgana_magic.nextclade_version, - nextclade_docker = morgana_magic.nextclade_docker, - nextclade_ds_tag = morgana_magic.nextclade_ds_tag, - nextclade_aa_subs = morgana_magic.nextclade_aa_subs, - nextclade_aa_dels = morgana_magic.nextclade_aa_dels, - nextclade_clade = morgana_magic.nextclade_clade, - nextclade_lineage = 
morgana_magic.nextclade_lineage, - nextclade_qc = morgana_magic.nextclade_qc, - nextclade_ds_tag_flu_ha = morgana_magic.nextclade_ds_tag_flu_ha, - nextclade_aa_subs_flu_ha = morgana_magic.nextclade_aa_subs_flu_ha, - nextclade_aa_dels_flu_ha = morgana_magic.nextclade_aa_dels_flu_ha, - nextclade_clade_flu_ha = morgana_magic.nextclade_clade_flu_ha, - nextclade_qc_flu_ha = morgana_magic.nextclade_qc_flu_ha, - nextclade_ds_tag_flu_na = morgana_magic.nextclade_ds_tag_flu_na, - nextclade_aa_subs_flu_na = morgana_magic.nextclade_aa_subs_flu_na, - nextclade_aa_dels_flu_na = morgana_magic.nextclade_aa_dels_flu_na, - nextclade_clade_flu_na = morgana_magic.nextclade_clade_flu_na, - nextclade_qc_flu_na = morgana_magic.nextclade_qc_flu_na + taxon_ids = write_lines(taxon_ids), + organism = write_lines(krakentools.organism_name), + extracted_read1 = write_lines(krakentools.extracted_read1), + extracted_read2 = write_lines(krakentools.extracted_read2), + krakentools_docker = write_lines(krakentools.krakentools_docker), + fastq_scan_num_reads_binned1 = write_lines(fastq_scan_binned.read1_seq), + fastq_scan_num_reads_binned2 = write_lines(fastq_scan_binned.read2_seq), + fastq_scan_num_reads_binned_pairs = write_lines(fastq_scan_binned.read_pairs), + fastq_scan_docker = write_lines(fastq_scan_binned.fastq_scan_docker), + fastq_scan_version = write_lines(fastq_scan_binned.version), + pilon_assembly_fasta = write_lines(pilon.assembly_fasta), # maybe?? 
+ quast_genome_length = write_lines(quast.genome_length), + quast_number_contigs = write_lines(quast.number_contigs), + quast_n50 = write_lines(quast.n50_value), + quast_gc_percent = write_lines(quast.gc_percent), + number_N = write_lines(morgana_magic.number_N), + number_ATCG = write_lines(morgana_magic.number_ATCG), + number_Degenerate = write_lines(morgana_magic.number_Degenerate), + number_Total = write_lines(morgana_magic.number_Total), + percent_reference_coverage = write_lines(morgana_magic.percent_reference_coverage), + pango_lineage = write_lines(morgana_magic.pango_lineage), + pango_lineage_expanded = write_lines(morgana_magic.pango_lineage_expanded), + pangolin_conflicts = write_lines(morgana_magic.pangolin_conflicts), + pangolin_notes = write_lines(morgana_magic.pangolin_notes), + pangolin_assignment_version = write_lines(morgana_magic.pangolin_assignment_version), + pangolin_versions = write_lines(morgana_magic.pangolin_versions), + pangolin_docker = write_lines(morgana_magic.pangolin_docker), + nextclade_version = write_lines(morgana_magic.nextclade_version), + nextclade_docker = write_lines(morgana_magic.nextclade_docker), + nextclade_ds_tag = write_lines(morgana_magic.nextclade_ds_tag), + nextclade_aa_subs = write_lines(morgana_magic.nextclade_aa_subs), + nextclade_aa_dels = write_lines(morgana_magic.nextclade_aa_dels), + nextclade_clade = write_lines(morgana_magic.nextclade_clade), + nextclade_lineage = write_lines(morgana_magic.nextclade_lineage), + nextclade_qc = write_lines(morgana_magic.nextclade_qc), + nextclade_ds_tag_flu_ha = write_lines(morgana_magic.nextclade_ds_tag_flu_ha), + nextclade_aa_subs_flu_ha = write_lines(morgana_magic.nextclade_aa_subs_flu_ha), + nextclade_aa_dels_flu_ha = write_lines(morgana_magic.nextclade_aa_dels_flu_ha), + nextclade_clade_flu_ha = write_lines(morgana_magic.nextclade_clade_flu_ha), + nextclade_qc_flu_ha = write_lines(morgana_magic.nextclade_qc_flu_ha), + nextclade_ds_tag_flu_na = 
write_lines(morgana_magic.nextclade_ds_tag_flu_na), + nextclade_aa_subs_flu_na = write_lines(morgana_magic.nextclade_aa_subs_flu_na), + nextclade_aa_dels_flu_na = write_lines(morgana_magic.nextclade_aa_dels_flu_na), + nextclade_clade_flu_na = write_lines(morgana_magic.nextclade_clade_flu_na), + nextclade_qc_flu_na = write_lines(morgana_magic.nextclade_qc_flu_na) } output { # kraken2 outputs From 498d07c73454297f6afa96a050c3229665cdc571 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 17 Oct 2024 15:19:36 +0000 Subject: [PATCH 19/48] more crazy ideas? --- .../data_handling/task_gather_scatter.wdl | 65 +++++-------------- 1 file changed, 16 insertions(+), 49 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 04b8a901f..083fd6018 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -66,57 +66,24 @@ task gather_scatter { command <<< python3< Date: Thu, 17 Oct 2024 15:51:43 +0000 Subject: [PATCH 20/48] maybe basename is a good idea --- tasks/utilities/data_handling/task_gather_scatter.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 083fd6018..a0a82bd6a 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -72,7 +72,7 @@ task gather_scatter { if os.path.exists(file_path): with open(file_path, 'r') as file: json_data_from_file = json.load(file) - df_from_file = pd.DataFrame(json_data_from_file, columns=[file_path]) + df_from_file = pd.DataFrame(json_data_from_file, columns=[os.basename(file_path)]) return df_from_file else: return None From fce8abbe0fa840f93c633b307c090e15d4c133c5 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 17 Oct 2024 16:01:48 +0000 Subject: [PATCH 21/48] change to json 
--- .../wf_theiameta_panel_illumina_pe.wdl | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 926864d7a..e5479c864 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -105,51 +105,51 @@ workflow theiameta_panel_illumina_pe { call gather_scatter_task.gather_scatter { input: samplename = samplename, - taxon_ids = write_lines(taxon_ids), - organism = write_lines(krakentools.organism_name), - extracted_read1 = write_lines(krakentools.extracted_read1), - extracted_read2 = write_lines(krakentools.extracted_read2), - krakentools_docker = write_lines(krakentools.krakentools_docker), - fastq_scan_num_reads_binned1 = write_lines(fastq_scan_binned.read1_seq), - fastq_scan_num_reads_binned2 = write_lines(fastq_scan_binned.read2_seq), - fastq_scan_num_reads_binned_pairs = write_lines(fastq_scan_binned.read_pairs), - fastq_scan_docker = write_lines(fastq_scan_binned.fastq_scan_docker), - fastq_scan_version = write_lines(fastq_scan_binned.version), - pilon_assembly_fasta = write_lines(pilon.assembly_fasta), # maybe?? 
- quast_genome_length = write_lines(quast.genome_length), - quast_number_contigs = write_lines(quast.number_contigs), - quast_n50 = write_lines(quast.n50_value), - quast_gc_percent = write_lines(quast.gc_percent), - number_N = write_lines(morgana_magic.number_N), - number_ATCG = write_lines(morgana_magic.number_ATCG), - number_Degenerate = write_lines(morgana_magic.number_Degenerate), - number_Total = write_lines(morgana_magic.number_Total), - percent_reference_coverage = write_lines(morgana_magic.percent_reference_coverage), - pango_lineage = write_lines(morgana_magic.pango_lineage), - pango_lineage_expanded = write_lines(morgana_magic.pango_lineage_expanded), - pangolin_conflicts = write_lines(morgana_magic.pangolin_conflicts), - pangolin_notes = write_lines(morgana_magic.pangolin_notes), - pangolin_assignment_version = write_lines(morgana_magic.pangolin_assignment_version), - pangolin_versions = write_lines(morgana_magic.pangolin_versions), - pangolin_docker = write_lines(morgana_magic.pangolin_docker), - nextclade_version = write_lines(morgana_magic.nextclade_version), - nextclade_docker = write_lines(morgana_magic.nextclade_docker), - nextclade_ds_tag = write_lines(morgana_magic.nextclade_ds_tag), - nextclade_aa_subs = write_lines(morgana_magic.nextclade_aa_subs), - nextclade_aa_dels = write_lines(morgana_magic.nextclade_aa_dels), - nextclade_clade = write_lines(morgana_magic.nextclade_clade), - nextclade_lineage = write_lines(morgana_magic.nextclade_lineage), - nextclade_qc = write_lines(morgana_magic.nextclade_qc), - nextclade_ds_tag_flu_ha = write_lines(morgana_magic.nextclade_ds_tag_flu_ha), - nextclade_aa_subs_flu_ha = write_lines(morgana_magic.nextclade_aa_subs_flu_ha), - nextclade_aa_dels_flu_ha = write_lines(morgana_magic.nextclade_aa_dels_flu_ha), - nextclade_clade_flu_ha = write_lines(morgana_magic.nextclade_clade_flu_ha), - nextclade_qc_flu_ha = write_lines(morgana_magic.nextclade_qc_flu_ha), - nextclade_ds_tag_flu_na = 
write_lines(morgana_magic.nextclade_ds_tag_flu_na), - nextclade_aa_subs_flu_na = write_lines(morgana_magic.nextclade_aa_subs_flu_na), - nextclade_aa_dels_flu_na = write_lines(morgana_magic.nextclade_aa_dels_flu_na), - nextclade_clade_flu_na = write_lines(morgana_magic.nextclade_clade_flu_na), - nextclade_qc_flu_na = write_lines(morgana_magic.nextclade_qc_flu_na) + taxon_ids = write_json(taxon_ids), + organism = write_json(krakentools.organism_name), + extracted_read1 = write_json(krakentools.extracted_read1), + extracted_read2 = write_json(krakentools.extracted_read2), + krakentools_docker = write_json(krakentools.krakentools_docker), + fastq_scan_num_reads_binned1 = write_json(fastq_scan_binned.read1_seq), + fastq_scan_num_reads_binned2 = write_json(fastq_scan_binned.read2_seq), + fastq_scan_num_reads_binned_pairs = write_json(fastq_scan_binned.read_pairs), + fastq_scan_docker = write_json(fastq_scan_binned.fastq_scan_docker), + fastq_scan_version = write_json(fastq_scan_binned.version), + pilon_assembly_fasta = write_json(pilon.assembly_fasta), # maybe?? 
+ quast_genome_length = write_json(quast.genome_length), + quast_number_contigs = write_json(quast.number_contigs), + quast_n50 = write_json(quast.n50_value), + quast_gc_percent = write_json(quast.gc_percent), + number_N = write_json(morgana_magic.number_N), + number_ATCG = write_json(morgana_magic.number_ATCG), + number_Degenerate = write_json(morgana_magic.number_Degenerate), + number_Total = write_json(morgana_magic.number_Total), + percent_reference_coverage = write_json(morgana_magic.percent_reference_coverage), + pango_lineage = write_json(morgana_magic.pango_lineage), + pango_lineage_expanded = write_json(morgana_magic.pango_lineage_expanded), + pangolin_conflicts = write_json(morgana_magic.pangolin_conflicts), + pangolin_notes = write_json(morgana_magic.pangolin_notes), + pangolin_assignment_version = write_json(morgana_magic.pangolin_assignment_version), + pangolin_versions = write_json(morgana_magic.pangolin_versions), + pangolin_docker = write_json(morgana_magic.pangolin_docker), + nextclade_version = write_json(morgana_magic.nextclade_version), + nextclade_docker = write_json(morgana_magic.nextclade_docker), + nextclade_ds_tag = write_json(morgana_magic.nextclade_ds_tag), + nextclade_aa_subs = write_json(morgana_magic.nextclade_aa_subs), + nextclade_aa_dels = write_json(morgana_magic.nextclade_aa_dels), + nextclade_clade = write_json(morgana_magic.nextclade_clade), + nextclade_lineage = write_json(morgana_magic.nextclade_lineage), + nextclade_qc = write_json(morgana_magic.nextclade_qc), + nextclade_ds_tag_flu_ha = write_json(morgana_magic.nextclade_ds_tag_flu_ha), + nextclade_aa_subs_flu_ha = write_json(morgana_magic.nextclade_aa_subs_flu_ha), + nextclade_aa_dels_flu_ha = write_json(morgana_magic.nextclade_aa_dels_flu_ha), + nextclade_clade_flu_ha = write_json(morgana_magic.nextclade_clade_flu_ha), + nextclade_qc_flu_ha = write_json(morgana_magic.nextclade_qc_flu_ha), + nextclade_ds_tag_flu_na = write_json(morgana_magic.nextclade_ds_tag_flu_na), + 
nextclade_aa_subs_flu_na = write_json(morgana_magic.nextclade_aa_subs_flu_na), + nextclade_aa_dels_flu_na = write_json(morgana_magic.nextclade_aa_dels_flu_na), + nextclade_clade_flu_na = write_json(morgana_magic.nextclade_clade_flu_na), + nextclade_qc_flu_na = write_json(morgana_magic.nextclade_qc_flu_na) } output { # kraken2 outputs From cc97d7076805d8b15661a60b556af64bf450fb57 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 17 Oct 2024 16:31:02 +0000 Subject: [PATCH 22/48] sort of works but is ugly --- tasks/utilities/data_handling/task_gather_scatter.wdl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index a0a82bd6a..3fb452914 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -58,7 +58,7 @@ task gather_scatter { File? nextclade_clade_flu_na File? nextclade_qc_flu_na # change to be a docker with pandas - String docker = "us-docker.pkg.dev/general-theiagen/quay/ubuntu:latest" + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" Int disk_size = 50 Int cpu = 2 Int memory = 8 @@ -67,19 +67,20 @@ task gather_scatter { python3< Date: Thu, 17 Oct 2024 16:38:58 +0000 Subject: [PATCH 23/48] IT WORKS --- .../data_handling/task_gather_scatter.wdl | 54 +++++++++++++++++-- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 3fb452914..5c90fdf33 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -69,22 +69,66 @@ task gather_scatter { import os import json - def load_json_data(file_path, column_name): + def load_json_data(file_path, column_name, df): if os.path.exists(file_path): with open(file_path, 'r') as file: 
json_data_from_file = json.load(file) df_from_file = pd.DataFrame(json_data_from_file, columns=[column_name]) - return df_from_file + df = pd.concat([df, df_from_file], axis=1) + return df else: return None df = pd.DataFrame() - taxon_ids_df = load_json_data("~{taxon_ids}", "taxon_ids") - if taxon_ids_df is not None: - df = pd.concat([df, taxon_ids_df], axis=1) + df = load_json_data("~{taxon_ids}", "taxon_ids", df) + df = load_json_data("~{organism}", "organism", df) + df = load_json_data("~{extracted_read1}", "extracted_read1", df) + df = load_json_data("~{extracted_read2}", "extracted_read2", df) + df = load_json_data("~{krakentools_docker}", "krakentools_docker", df) + df = load_json_data("~{fastq_scan_num_reads_binned1}", "fastq_scan_num_reads_binned1", df) + df = load_json_data("~{fastq_scan_num_reads_binned2}", "fastq_scan_num_reads_binned2", df) + df = load_json_data("~{fastq_scan_num_reads_binned_pairs}", "fastq_scan_num_reads_binned_pairs", df) + df = load_json_data("~{fastq_scan_docker}", "fastq_scan_docker", df) + df = load_json_data("~{fastq_scan_version}", "fastq_scan_version", df) + df = load_json_data("~{pilon_assembly_fasta}", "pilon_assembly_fasta", df) + df = load_json_data("~{quast_genome_length}", "quast_genome_length", df) + df = load_json_data("~{quast_number_contigs}", "quast_number_contigs", df) + df = load_json_data("~{quast_n50}", "quast_n50", df) + df = load_json_data("~{quast_gc_percent}", "quast_gc_percent", df) + df = load_json_data("~{number_N}", "number_N", df) + df = load_json_data("~{number_ATCG}", "number_ATCG", df) + df = load_json_data("~{number_Degenerate}", "number_Degenerate", df) + df = load_json_data("~{number_Total}", "number_Total", df) + df = load_json_data("~{percent_reference_coverage}", "percent_reference_coverage", df) + df = load_json_data("~{pango_lineage}", "pango_lineage", df) + df = load_json_data("~{pango_lineage_expanded}", "pango_lineage_expanded", df) + df = load_json_data("~{pangolin_conflicts}", 
"pangolin_conflicts", df) + df = load_json_data("~{pangolin_notes}", "pangolin_notes", df) + df = load_json_data("~{pangolin_assignment_version}", "pangolin_assignment_version", df) + df = load_json_data("~{pangolin_versions}", "pangolin_versions", df) + df = load_json_data("~{pangolin_docker}", "pangolin_docker", df) + df = load_json_data("~{nextclade_version}", "nextclade_version", df) + df = load_json_data("~{nextclade_docker}", "nextclade_docker", df) + df = load_json_data("~{nextclade_ds_tag}", "nextclade_ds_tag", df) + df = load_json_data("~{nextclade_aa_subs}", "nextclade_aa_subs", df) + df = load_json_data("~{nextclade_aa_dels}", "nextclade_aa_dels", df) + df = load_json_data("~{nextclade_clade}", "nextclade_clade", df) + df = load_json_data("~{nextclade_lineage}", "nextclade_lineage", df) + df = load_json_data("~{nextclade_qc}", "nextclade_qc", df) + df = load_json_data("~{nextclade_ds_tag_flu_ha}", "nextclade_ds_tag_flu_ha", df) + df = load_json_data("~{nextclade_aa_subs_flu_ha}", "nextclade_aa_subs_flu_ha", df) + df = load_json_data("~{nextclade_aa_dels_flu_ha}", "nextclade_aa_dels_flu_ha", df) + df = load_json_data("~{nextclade_clade_flu_ha}", "nextclade_clade_flu_ha", df) + df = load_json_data("~{nextclade_qc_flu_ha}", "nextclade_qc_flu_ha", df) + df = load_json_data("~{nextclade_ds_tag_flu_na}", "nextclade_ds_tag_flu_na", df) + df = load_json_data("~{nextclade_aa_subs_flu_na}", "nextclade_aa_subs_flu_na", df) + df = load_json_data("~{nextclade_aa_dels_flu_na}", "nextclade_aa_dels_flu_na", df) + df = load_json_data("~{nextclade_clade_flu_na}", "nextclade_clade_flu_na", df) + df = load_json_data("~{nextclade_qc_flu_na}", "nextclade_qc_flu_na", df) print(df) + df.to_csv("~{samplename}.results.tsv", sep='\t', index=False) CODE From bc96474819bef687b6c72d764649f28763c669a5 Mon Sep 17 00:00:00 2001 From: cimendes Date: Thu, 17 Oct 2024 16:40:26 +0000 Subject: [PATCH 24/48] clean up --- tasks/utilities/data_handling/task_gather_scatter.wdl | 4 ---- 1 file 
changed, 4 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 5c90fdf33..9b636e590 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -130,11 +130,7 @@ task gather_scatter { print(df) df.to_csv("~{samplename}.results.tsv", sep='\t', index=False) - CODE - - - >>> output { File gathered_results = "~{samplename}.results.tsv" From 93bb88b97be19079293a7812190fe9abd876be18 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 13:22:21 +0000 Subject: [PATCH 25/48] add dummy genome length & logic block consensus qc --- workflows/utilities/wf_morgana_magic.wdl | 24 ++++++++++--------- .../utilities/wf_organism_parameters.wdl | 11 ++++----- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index ba7a779db..686497124 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -20,13 +20,15 @@ workflow morgana_magic { call set_organism_defaults.organism_parameters { input: taxon_id = taxon_id, - organism = "To Be Determined" + organism = "unsupported" } - call consensus_qc_task.consensus_qc { - input: - assembly_fasta = assembly_fasta, - reference_genome = organism_parameters.reference, - genome_length = organism_parameters.genome_length + if (organism_parameters.standardized_organism != "unsupported") { # occurs in theiameta_panel + call consensus_qc_task.consensus_qc { + input: + assembly_fasta = assembly_fasta, + reference_genome = organism_parameters.reference, + genome_length = organism_parameters.genome_length + } } if (organism_parameters.standardized_organism == "flu") { call flu_track_wf.flu_track { @@ -72,11 +74,11 @@ workflow morgana_magic { output { String organism = organism_parameters.standardized_organism # Consensus QC outputs - Int number_N = 
consensus_qc.number_N - Int number_ATCG = consensus_qc.number_ATCG - Int number_Degenerate = consensus_qc.number_Degenerate - Int number_Total = consensus_qc.number_Total - Float percent_reference_coverage = consensus_qc.percent_reference_coverage + Int? number_N = consensus_qc.number_N + Int? number_ATCG = consensus_qc.number_ATCG + Int? number_Degenerate = consensus_qc.number_Degenerate + Int? number_Total = consensus_qc.number_Total + Float? percent_reference_coverage = consensus_qc.percent_reference_coverage # Pangolin outputs String? pango_lineage = pangolin4.pangolin_lineage String? pango_lineage_expanded = pangolin4.pangolin_lineage_expanded diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 87e34abbb..0251a19a2 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -51,6 +51,8 @@ workflow organism_parameters { } # for morgana_magic & theiameta_panel compatibility if (defined(taxon_id)) { + # set dummy values for unsupported organisms to prevent workflow failure + Int unsupported_theiameta_panel_genome_length = 0 if (select_first([taxon_id]) == "2697049") { String sars_cov_2_taxon_id = "sars-cov-2" } @@ -61,13 +63,10 @@ workflow organism_parameters { String wnv_taxon_id = "WNV" } if (select_first([taxon_id]) == "11320") { - # flu A - String flu_a_taxon_id = "flu" - # also do flu type + String flu_a_taxon_id = "flu" # flu A } if (select_first([taxon_id]) == "11520") { - # flu B - String flu_b_taxon_id = "flu" + String flu_b_taxon_id = "flu" # flu B } if (select_first([taxon_id]) == "12814") { String rsv_a_taxon_id = "rsv_a" @@ -299,7 +298,7 @@ workflow organism_parameters { File gene_locations_bed = select_first([gene_locations_bed_file, sc2_gene_locations_bed, mpox_gene_locations_bed, "gs://theiagen-public-files/terra/theiacov-files/empty.bed"]) File primer_bed = select_first([primer_bed_file, mpox_primer_bed_file, wnv_primer_bed_file, 
hiv_v1_primer_bed, hiv_v2_primer_bed, "gs://theiagen-public-files/terra/theiacov-files/empty.bed"]) File reference_gff = select_first([reference_gff_file, mpox_reference_gff_file, hiv_v1_reference_gff, hiv_v2_reference_gff, "gs://theiagen-public-files/terra/theiacov-files/empty.gff3"]) - Int genome_length = select_first([genome_length_input, sc2_genome_len, mpox_genome_len, wnv_genome_len, flu_genome_len, rsv_a_genome_len, rsv_b_genome_len, hiv_v1_genome_len, hiv_v2_genome_len]) + Int genome_length = select_first([genome_length_input, sc2_genome_len, mpox_genome_len, wnv_genome_len, flu_genome_len, rsv_a_genome_len, rsv_b_genome_len, hiv_v1_genome_len, hiv_v2_genome_len, unsupported_theiameta_panel_genome_length]) # nextclade information String nextclade_dataset_tag = select_first([nextclade_dataset_tag_input, sc2_nextclade_ds_tag, mpox_nextclade_ds_tag, wnv_nextclade_ds_tag, h1n1_ha_nextclade_ds_tag, h3n2_ha_nextclade_ds_tag, vic_ha_nextclade_ds_tag, yam_ha_nextclade_ds_tag, h5n1_ha_nextclade_ds_tag, h1n1_na_nextclade_ds_tag, h3n2_na_nextclade_ds_tag, vic_na_nextclade_ds_tag, yam_na_nextclade_ds_tag, rsv_a_nextclade_ds_tag, rsv_b_nextclade_ds_tag, "NA"]) String nextclade_dataset_name = select_first([nextclade_dataset_name_input, sc2_nextclade_ds_name, mpox_nextclade_ds_name, wnv_nextclade_ds_name, h1n1_ha_nextclade_ds_name, h3n2_ha_nextclade_ds_name, vic_ha_nextclade_ds_name, yam_ha_nextclade_ds_name, h5n1_ha_nextclade_ds_name, h1n1_na_nextclade_ds_name, h3n2_na_nextclade_ds_name, vic_na_nextclade_ds_name, yam_na_nextclade_ds_name, rsv_a_nextclade_ds_name, rsv_b_nextclade_ds_name, "NA"]) From c4cf61b03d5ba448d4c265f6118dfa713e551757 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 13:48:57 +0000 Subject: [PATCH 26/48] remove null values from identified_organisms otuput --- tasks/utilities/data_handling/task_gather_scatter.wdl | 6 ++++++ workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 5 +---- 2 files changed, 7 insertions(+), 4 
deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 9b636e590..d2bb28456 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -66,6 +66,7 @@ task gather_scatter { command <<< python3<>> output { File gathered_results = "~{samplename}.results.tsv" + Array[String] organism_names = read_lines("~{samplename}.organism_names.tsv") + } runtime { docker: "~{docker}" diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index e5479c864..0f11e24ae 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -159,10 +159,7 @@ workflow theiameta_panel_illumina_pe { File kraken2_report = kraken2.kraken2_report File kraken2_classified_report = kraken2.kraken2_classified_report # krakentools outputs - Array[String] identified_organisms = krakentools.organism_name - # docker image??? 
-- work on figuring out how to make this not an array - # Array[String] krakentools_docker = select_first([krakentools.krakentools_docker]), + Array[String] identified_organisms = gather_scatter.organism_names File results_by_taxon_tsv = gather_scatter.gathered_results - } } \ No newline at end of file From 4e1c37334097cd5259f2dfaa58e49a414b3bf3b1 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 13:55:52 +0000 Subject: [PATCH 27/48] add versioning --- workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 0f11e24ae..bdef92d95 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -5,6 +5,7 @@ import "../../tasks/assembly/task_metaspades.wdl" as metaspades_task import "../../tasks/quality_control/basic_statistics/task_fastq_scan.wdl" as fastq_scan import "../../tasks/quality_control/basic_statistics/task_quast.wdl" as quast_task import "../../tasks/quality_control/read_filtering/task_pilon.wdl" as pilon_task +import "../../tasks/task_versioning.wdl" as versioning import "../../tasks/taxon_id/contamination/task_kraken2.wdl" as kraken_task import "../../tasks/taxon_id/task_krakentools.wdl" as krakentools_task import "../../tasks/utilities/data_handling/task_gather_scatter.wdl" as gather_scatter_task @@ -22,6 +23,9 @@ workflow theiameta_panel_illumina_pe { Int minimum_read_number = 100 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" } + call versioning.version_capture { + input: + } call read_qc_trim_pe.read_QC_trim_pe as read_QC_trim { input: samplename = samplename, @@ -152,6 +156,9 @@ workflow theiameta_panel_illumina_pe { nextclade_qc_flu_na = write_json(morgana_magic.nextclade_qc_flu_na) } output { + # versioning outputs + String 
theiameta_panel_illumina_pe_version = version_capture.phb_version + String theiameta_panel_illumina_pe_analysis_date = version_capture.date # kraken2 outputs String kraken2_version = kraken2.kraken2_version String kraken2_database = kraken2.kraken2_database From 148cb9da42fbe538e3259be73af09b6be5bbbbd8 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 14:53:24 +0000 Subject: [PATCH 28/48] up to 1000 --- workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index bdef92d95..70ba8d762 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -20,7 +20,7 @@ workflow theiameta_panel_illumina_pe { File read2 Array[Int] taxon_ids # suggest using a workspace element if user wants to modify? - Int minimum_read_number = 100 + Int minimum_read_number = 1000 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" } call versioning.version_capture { From 8c7de78565e742a5135296de37b76a57bde45212 Mon Sep 17 00:00:00 2001 From: cimendes Date: Mon, 21 Oct 2024 15:11:08 +0000 Subject: [PATCH 29/48] make theiameta_panel fault-resistant, has impacts on theiameta_illumina_pe --- tasks/assembly/task_metaspades.wdl | 16 +- .../read_filtering/task_pilon.wdl | 16 +- .../theiameta/wf_theiameta_illumina_pe.wdl | 218 +++++++++--------- .../wf_theiameta_panel_illumina_pe.wdl | 74 +++--- 4 files changed, 173 insertions(+), 151 deletions(-) diff --git a/tasks/assembly/task_metaspades.wdl b/tasks/assembly/task_metaspades.wdl index a6473ae14..e615281a2 100644 --- a/tasks/assembly/task_metaspades.wdl +++ b/tasks/assembly/task_metaspades.wdl @@ -15,7 +15,9 @@ task metaspades_pe { } command <<< metaspades.py --version | head -1 | cut -d ' ' -f 2 | tee VERSION - metaspades.py \ + touch 
WARNING + + if metaspades.py \ -1 ~{read1_cleaned} \ -2 ~{read2_cleaned} \ ~{'-k ' + kmers} \ @@ -23,15 +25,21 @@ task metaspades_pe { -t ~{cpu} \ -o metaspades \ --phred-offset ~{phred_offset} \ - ~{metaspades_opts} + ~{metaspades_opts}; then - mv metaspades/contigs.fasta ~{samplename}_contigs.fasta + mv metaspades/contigs.fasta ~{samplename}_contigs.fasta + + else + tee "Metaspades failed to assemble for ~{samplename}" > WARNING + exit 1 + fi >>> output { - File assembly_fasta = "~{samplename}_contigs.fasta" + File? assembly_fasta = "~{samplename}_contigs.fasta" String metaspades_version = read_string("VERSION") String metaspades_docker = '~{docker}' + String metaspades_warning = read_string("WARNING") } runtime { docker: "~{docker}" diff --git a/tasks/quality_control/read_filtering/task_pilon.wdl b/tasks/quality_control/read_filtering/task_pilon.wdl index 2e869832d..b68f64612 100644 --- a/tasks/quality_control/read_filtering/task_pilon.wdl +++ b/tasks/quality_control/read_filtering/task_pilon.wdl @@ -16,20 +16,26 @@ task pilon { pilon --version | cut -d' ' -f3 | tee VERSION # run pilon - pilon \ + if pilon \ --genome ~{assembly} \ --frags ~{bam} \ --output ~{samplename} \ --outdir pilon \ - --changes --vcf + --changes --vcf; then + touch WARNING + else + tee "Pilon failed to run for ~{samplename}" > WARNING + exit 1 + fi >>> output { - File assembly_fasta = "pilon/~{samplename}.fasta" - File changes = "pilon/~{samplename}.changes" - File vcf = "pilon/~{samplename}.vcf" + File? assembly_fasta = "pilon/~{samplename}.fasta" + File? changes = "pilon/~{samplename}.changes" + File? 
vcf = "pilon/~{samplename}.vcf" String pilon_version = read_string("VERSION") String pilon_docker = "~{docker}" + String pilon_warning = read_string("WARNING") } runtime { docker: "~{docker}" diff --git a/workflows/theiameta/wf_theiameta_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_illumina_pe.wdl index 51f1a0054..f9cbbe7c8 100644 --- a/workflows/theiameta/wf_theiameta_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_illumina_pe.wdl @@ -71,110 +71,114 @@ workflow theiameta_illumina_pe { read2_cleaned = read_QC_trim.read2_clean, samplename = samplename } - call minimap2_task.minimap2 as minimap2_assembly_correction { - input: - query1 = read_QC_trim.read1_clean, - query2 = read_QC_trim.read2_clean, - reference = metaspades_pe.assembly_fasta, - samplename = samplename, - mode = "sr", - output_sam = true - } - call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { - input: - sam = minimap2_assembly_correction.minimap2_out, - samplename = samplename - } - call pilon_task.pilon { - input: - assembly = metaspades_pe.assembly_fasta, - bam = sort_bam_assembly_correction.bam, - bai = sort_bam_assembly_correction.bai, - samplename = samplename - } - # if reference is provided, perform mapping of assembled contigs to - # reference with minimap2, and extract those as final assembly - if (defined(reference)) { - call minimap2_task.minimap2 as minimap2_assembly { - input: - query1 = pilon.assembly_fasta, - reference = select_first([reference]), - samplename = samplename, - mode = "asm20", - output_sam = false - } - call parse_mapping_task.retrieve_aligned_contig_paf { - input: - paf = minimap2_assembly.minimap2_out, - assembly = pilon.assembly_fasta, - samplename = samplename - } - call parse_mapping_task.calculate_coverage_paf { - input: - paf = minimap2_assembly.minimap2_out - } - } - call quast_task.quast { + if (defined(metaspades_pe.assembly_fasta)) { + call minimap2_task.minimap2 as minimap2_assembly_correction { input: - assembly = 
select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), + query1 = read_QC_trim.read1_clean, + query2 = read_QC_trim.read2_clean, + reference = select_first([metaspades_pe.assembly_fasta]), samplename = samplename, - min_contig_length = 1 - } - if (output_additional_files) { - call minimap2_task.minimap2 as minimap2_reads { - input: - query1 = read_QC_trim.read1_clean, - query2 = read_QC_trim.read2_clean, - reference = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), - samplename = samplename, - mode = "sr", - output_sam = true - } - call parse_mapping_task.sam_to_sorted_bam { - input: - sam = minimap2_reads.minimap2_out, - samplename = samplename - } - call parse_mapping_task.calculate_coverage { - input: - bam = sam_to_sorted_bam.bam, - bai = sam_to_sorted_bam.bai - } - call parse_mapping_task.retrieve_pe_reads_bam as retrieve_unaligned_pe_reads_sam { - input: - bam = sam_to_sorted_bam.bam, - samplename = samplename, - prefix = "unassembled", - sam_flag = 4 + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { + input: + sam = minimap2_assembly_correction.minimap2_out, + samplename = samplename + } + call pilon_task.pilon { + input: + assembly = select_first([metaspades_pe.assembly_fasta]), + bam = sort_bam_assembly_correction.bam, + bai = sort_bam_assembly_correction.bai, + samplename = samplename + } + if (defined(pilon.assembly_fasta)) { + # if reference is provided, perform mapping of assembled contigs to + # reference with minimap2, and extract those as final assembly + if (defined(reference)) { + call minimap2_task.minimap2 as minimap2_assembly { + input: + query1 = select_first([pilon.assembly_fasta]), + reference = select_first([reference]), + samplename = samplename, + mode = "asm20", + output_sam = false + } + call parse_mapping_task.retrieve_aligned_contig_paf { + input: + paf = minimap2_assembly.minimap2_out, + assembly = 
select_first([pilon.assembly_fasta]), + samplename = samplename + } + call parse_mapping_task.calculate_coverage_paf { + input: + paf = minimap2_assembly.minimap2_out + } } - call parse_mapping_task.retrieve_pe_reads_bam as retrieve_aligned_pe_reads_sam { + call quast_task.quast { input: - bam = sam_to_sorted_bam.bam, + assembly = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), samplename = samplename, - sam_flag = 2, - prefix = "assembled" + min_contig_length = 1 + } + if (output_additional_files) { + call minimap2_task.minimap2 as minimap2_reads { + input: + query1 = read_QC_trim.read1_clean, + query2 = read_QC_trim.read2_clean, + reference = select_first([retrieve_aligned_contig_paf.final_assembly, pilon.assembly_fasta]), + samplename = samplename, + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam { + input: + sam = minimap2_reads.minimap2_out, + samplename = samplename + } + call parse_mapping_task.calculate_coverage { + input: + bam = sam_to_sorted_bam.bam, + bai = sam_to_sorted_bam.bai + } + call parse_mapping_task.retrieve_pe_reads_bam as retrieve_unaligned_pe_reads_sam { + input: + bam = sam_to_sorted_bam.bam, + samplename = samplename, + prefix = "unassembled", + sam_flag = 4 + } + call parse_mapping_task.retrieve_pe_reads_bam as retrieve_aligned_pe_reads_sam { + input: + bam = sam_to_sorted_bam.bam, + samplename = samplename, + sam_flag = 2, + prefix = "assembled" + } + call parse_mapping_task.assembled_reads_percent { + input: + bam = sam_to_sorted_bam.bam, + } } - call parse_mapping_task.assembled_reads_percent { - input: - bam = sam_to_sorted_bam.bam, - } - } - if (! 
defined(reference)) { - call bwa_task.bwa as bwa { - input: - read1 = read_QC_trim.read1_clean, - read2 = read_QC_trim.read2_clean, - reference_genome = pilon.assembly_fasta, - samplename = samplename - } - call semibin_task.semibin as semibin { - input: - sorted_bam = bwa.sorted_bam, - sorted_bai = bwa.sorted_bai, - assembly_fasta = pilon.assembly_fasta, - samplename = samplename + if (! defined(reference)) { + call bwa_task.bwa as bwa { + input: + read1 = read_QC_trim.read1_clean, + read2 = read_QC_trim.read2_clean, + reference_genome = pilon.assembly_fasta, + samplename = samplename + } + call semibin_task.semibin as semibin { + input: + sorted_bam = bwa.sorted_bam, + sorted_bai = bwa.sorted_bai, + assembly_fasta = select_first([pilon.assembly_fasta]), + samplename = samplename + } } } + } call versioning.version_capture { input: } @@ -240,20 +244,20 @@ workflow theiameta_illumina_pe { String metaspades_version = metaspades_pe.metaspades_version String metaspades_docker = metaspades_pe.metaspades_docker # Assembly - minimap2 - String minimap2_version = minimap2_assembly_correction.minimap2_version - String minimap2_docker = minimap2_assembly_correction.minimap2_docker + String? minimap2_version = minimap2_assembly_correction.minimap2_version + String? minimap2_docker = minimap2_assembly_correction.minimap2_docker # Assembly - samtools - String samtools_version = sort_bam_assembly_correction.samtools_version - String samtools_docker = sort_bam_assembly_correction.samtools_docker + String? samtools_version = sort_bam_assembly_correction.samtools_version + String? samtools_docker = sort_bam_assembly_correction.samtools_docker # Assembly - pilon - String pilon_version = pilon.pilon_version - String pilon_docker = pilon.pilon_docker + String? pilon_version = pilon.pilon_version + String? 
pilon_docker = pilon.pilon_docker # Assembly QC - quast - Int assembly_length = quast.genome_length - Int contig_number = quast.number_contigs - Int largest_contig = quast.largest_contig - String quast_version = quast.version - String quast_docker = quast.quast_docker + Int? assembly_length = quast.genome_length + Int? contig_number = quast.number_contigs + Int? largest_contig = quast.largest_contig + String? quast_version = quast.version + String? quast_docker = quast.quast_docker # Assembly QC - minimap2 Float? percent_coverage = calculate_coverage_paf.percent_coverage # Assembly QC - bedtools diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 70ba8d762..c0aec7a36 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -67,41 +67,45 @@ workflow theiameta_panel_illumina_pe { read2_cleaned = krakentools.extracted_read2, samplename = "~{samplename}_~{taxon_id}" } - call minimap2_task.minimap2 as minimap2_assembly_correction { - input: - query1 = krakentools.extracted_read1, - query2 = krakentools.extracted_read2, - reference = metaspades_pe.assembly_fasta, - samplename = "~{samplename}_~{taxon_id}", - mode = "sr", - output_sam = true - } - call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { - input: - sam = minimap2_assembly_correction.minimap2_out, - samplename = "~{samplename}_~{taxon_id}" - } - call pilon_task.pilon { - input: - assembly = metaspades_pe.assembly_fasta, - bam = sort_bam_assembly_correction.bam, - bai = sort_bam_assembly_correction.bai, - samplename = "~{samplename}_~{taxon_id}" - } - call quast_task.quast { - input: - assembly = pilon.assembly_fasta, - samplename = "~{samplename}_~{taxon_id}", - min_contig_length = 1 - } - call morgana_magic_workflow.morgana_magic { - input: - samplename = "~{samplename}_~{taxon_id}", - assembly_fasta = pilon.assembly_fasta, - read1 = 
krakentools.extracted_read1, - read2 = krakentools.extracted_read2, - taxon_id = "~{taxon_id}", - seq_method = "ILLUMINA" + if (defined(metaspades_pe.assembly_fasta)) { + call minimap2_task.minimap2 as minimap2_assembly_correction { + input: + query1 = krakentools.extracted_read1, + query2 = krakentools.extracted_read2, + reference = select_first([metaspades_pe.assembly_fasta]), + samplename = "~{samplename}_~{taxon_id}", + mode = "sr", + output_sam = true + } + call parse_mapping_task.sam_to_sorted_bam as sort_bam_assembly_correction { + input: + sam = minimap2_assembly_correction.minimap2_out, + samplename = "~{samplename}_~{taxon_id}" + } + call pilon_task.pilon { + input: + assembly = select_first([metaspades_pe.assembly_fasta]), + bam = sort_bam_assembly_correction.bam, + bai = sort_bam_assembly_correction.bai, + samplename = "~{samplename}_~{taxon_id}" + } + if (defined(pilon.assembly_fasta)) { + call quast_task.quast { + input: + assembly = select_first([pilon.assembly_fasta]), + samplename = "~{samplename}_~{taxon_id}", + min_contig_length = 1 + } + call morgana_magic_workflow.morgana_magic { + input: + samplename = "~{samplename}_~{taxon_id}", + assembly_fasta = select_first([pilon.assembly_fasta]), + read1 = krakentools.extracted_read1, + read2 = krakentools.extracted_read2, + taxon_id = "~{taxon_id}", + seq_method = "ILLUMINA" + } + } } } } From 365837e05db867216a197c4db26a2f641acb1ca8 Mon Sep 17 00:00:00 2001 From: cimendes Date: Mon, 21 Oct 2024 16:24:27 +0000 Subject: [PATCH 30/48] add catch if assembly file is empty --- tasks/assembly/task_metaspades.wdl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tasks/assembly/task_metaspades.wdl b/tasks/assembly/task_metaspades.wdl index e615281a2..45b71532d 100644 --- a/tasks/assembly/task_metaspades.wdl +++ b/tasks/assembly/task_metaspades.wdl @@ -28,6 +28,12 @@ task metaspades_pe { ~{metaspades_opts}; then mv metaspades/contigs.fasta ~{samplename}_contigs.fasta + + if [ ! 
-s ~{samplename}_contigs.fasta ]; then + tee "Metaspades produced an empty assembly for ~{samplename}" > WARNING + rm -f ~{samplename}_contigs.fasta + exit 1 + fi else tee "Metaspades failed to assemble for ~{samplename}" > WARNING From 5bcb25b36805faaf13e9bdd3e825e298ec8ad759 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 17:33:48 +0000 Subject: [PATCH 31/48] remove exit 1 because it's causing task to fail --- tasks/assembly/task_metaspades.wdl | 2 -- 1 file changed, 2 deletions(-) diff --git a/tasks/assembly/task_metaspades.wdl b/tasks/assembly/task_metaspades.wdl index 45b71532d..4982e3521 100644 --- a/tasks/assembly/task_metaspades.wdl +++ b/tasks/assembly/task_metaspades.wdl @@ -32,12 +32,10 @@ task metaspades_pe { if [ ! -s ~{samplename}_contigs.fasta ]; then tee "Metaspades produced an empty assembly for ~{samplename}" > WARNING rm -f ~{samplename}_contigs.fasta - exit 1 fi else tee "Metaspades failed to assemble for ~{samplename}" > WARNING - exit 1 fi >>> From 166e9fbe8121e04747967a856175257c9c8cf235 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 17:39:52 +0000 Subject: [PATCH 32/48] update contributions --- README.md | 6 ++++-- docs/index.md | 4 +++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 84bccba76..a9046673d 100644 --- a/README.md +++ b/README.md @@ -47,13 +47,14 @@ You can expect a careful review of every PR and feedback as needed before mergin * **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision * **Inês Mendes** ([@cimendes](https://github.com/cimendes)) - Software, Validation * **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation * **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation * **Michelle 
Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation * **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision * **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation -* **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision +* **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation +* **Michal-Babins** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation * **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision +* **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision * **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation * **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation * **Joel Sevinsky** ([@sevinsky](https://github.com/sevinsky)) - Conceptualization, Project Administration, Supervision @@ -62,6 +63,7 @@ You can expect a careful review of every PR and feedback as needed before mergin We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB repository: +* **James Otieno** ([@jrotieno](https://github.com/jrotieno)) * **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) * **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) * **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) diff --git a/docs/index.md b/docs/index.md index 058b2149d..8454cde50 100644 --- a/docs/index.md +++ b/docs/index.md @@ -65,10 +65,11 @@ You can expect a careful review of every PR and feedback as needed before mergin - **Sage Wright** ([@sage-wright](https://github.com/sage-wright)) - Conceptualization, Software, Validation, Supervision - **Inês Mendes** 
([@cimendes](https://github.com/cimendes)) - Software, Validation - **Curtis Kapsak** ([@kapsakcj](https://github.com/kapsakcj)) - Conceptualization, Software, Validation -- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - Software, Validation - **Frank Ambrosio** ([@frankambrosio3](https://github.com/frankambrosio3)) - Conceptualization, Software, Validation - **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation - **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision +- **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation +- **Michal-Babins** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation - **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation - **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision - **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision @@ -80,6 +81,7 @@ You can expect a careful review of every PR and feedback as needed before mergin We would like to gratefully acknowledge the following individuals from the public health community for their contributions to the PHB repository: +- **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) - **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) - **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) From 05d55f776e08996d1da081f0f7ef69769eba70ee Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 17:59:23 +0000 Subject: [PATCH 33/48] add warnings to gathered output --- tasks/utilities/data_handling/task_gather_scatter.wdl | 4 ++++ workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index d2bb28456..d97fe6b25 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -16,6 +16,8 @@ task gather_scatter { File? fastq_scan_docker File? fastq_scan_version # Assembly + File? metaspades_warning + File? pilon_warning File? pilon_assembly_fasta### maybe????? # quast outputs File? quast_genome_length @@ -92,6 +94,8 @@ task gather_scatter { df = load_json_data("~{fastq_scan_num_reads_binned_pairs}", "fastq_scan_num_reads_binned_pairs", df) df = load_json_data("~{fastq_scan_docker}", "fastq_scan_docker", df) df = load_json_data("~{fastq_scan_version}", "fastq_scan_version", df) + df = load_json_data("~{metaspades_warning}", "metaspades_warning", df) + df = load_json_data("~{pilon_warning}", "pilon_warning", df) df = load_json_data("~{pilon_assembly_fasta}", "pilon_assembly_fasta", df) df = load_json_data("~{quast_genome_length}", "quast_genome_length", df) df = load_json_data("~{quast_number_contigs}", "quast_number_contigs", df) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index c0aec7a36..a71a82ee1 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -115,7 +115,7 @@ workflow theiameta_panel_illumina_pe { samplename = samplename, taxon_ids = write_json(taxon_ids), organism = write_json(krakentools.organism_name), - extracted_read1 = write_json(krakentools.extracted_read1), + extracted_read1 = write_json(krakentools.extracted_read1), ## not sure how useful these links are extracted_read2 = write_json(krakentools.extracted_read2), krakentools_docker = write_json(krakentools.krakentools_docker), fastq_scan_num_reads_binned1 = write_json(fastq_scan_binned.read1_seq), @@ -123,6 +123,8 @@ workflow 
theiameta_panel_illumina_pe { fastq_scan_num_reads_binned_pairs = write_json(fastq_scan_binned.read_pairs), fastq_scan_docker = write_json(fastq_scan_binned.fastq_scan_docker), fastq_scan_version = write_json(fastq_scan_binned.version), + metaspades_warning = write_json(metaspades_pe.metaspades_warning), + pilon_warning = write_json(pilon.pilon_warning), pilon_assembly_fasta = write_json(pilon.assembly_fasta), # maybe?? quast_genome_length = write_json(quast.genome_length), quast_number_contigs = write_json(quast.number_contigs), From ff73187107afead1d6a91076fb73621fca31bf00 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 21 Oct 2024 18:00:49 +0000 Subject: [PATCH 34/48] bump up al --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 8454cde50..2d8eb679a 100644 --- a/docs/index.md +++ b/docs/index.md @@ -71,8 +71,8 @@ You can expect a careful review of every PR and feedback as needed before mergin - **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation - **Michal-Babins** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation - **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation -- **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision - **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision +- **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision - **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation - **Emily Smith** ([@emily-smith1](https://github.com/emily-smith1)) - Validation - **Joel Sevinsky** ([@sevinsky](https://github.com/sevinsky)) - Conceptualization, Project Administration, Supervision From 2e25834d6b42659234f509692bcb73a185884afb Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Tue, 22 Oct 2024 15:48:43 +0000 
Subject: [PATCH 35/48] work on inputs --- .../genomic_characterization/theiameta_panel.md | 10 +++++++++- .../basic_statistics/task_fastq_scan.wdl | 6 +++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index fc0a43ed9..fe5f002e4 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -14,7 +14,15 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| -| task_name | **variable_name** | Type | Description | Default Value | Required/Optional | +| theiameta_panel_illumina_pe | **read1** | File | The forward Illumina read in FASTQ file format (compression optional) | | Required | +| theiameta_panel_illumina_pe | **read2** | File | The reverse Illumina read in FASTQ file format (compression optional) | | Required | +| theiameta_panel_illumina_pe | **samplename** | String | The name of the sample being analyzed | | Required | +| theiameta_panel_illumina_pe | **taxon_ids** | Array[Int] | The taxon IDs to be used for taxonomic binning | | Required | +| fastq_scan_binned | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| fastq_scan_binned | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| fastq_scan_binned | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | +| fastq_scan_binned | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| fastq_scan_binned | ### Workflow Tasks diff --git a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl index 
029b94917..489cbb1f1 100644 --- a/tasks/quality_control/basic_statistics/task_fastq_scan.wdl +++ b/tasks/quality_control/basic_statistics/task_fastq_scan.wdl @@ -4,13 +4,13 @@ task fastq_scan_pe { input { File read1 File read2 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") - String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") Int disk_size = 100 String docker = "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" Int memory = 2 Int cpu = 2 } + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") + String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") command <<< # capture date and version date | tee DATE @@ -64,12 +64,12 @@ task fastq_scan_pe { task fastq_scan_se { input { File read1 - String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") Int disk_size = 100 Int memory = 2 Int cpu = 2 String docker = "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" } + String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") command <<< # capture date and version date | tee DATE From c04fa48dbde1251c2be9c497c14cab39ba6d357a Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 23 Oct 2024 19:01:39 +0000 Subject: [PATCH 36/48] hide some optional inputs --- .../theiameta/wf_theiameta_panel_illumina_pe.wdl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index a71a82ee1..67610109f 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -18,7 +18,10 @@ workflow theiameta_panel_illumina_pe { String samplename File read1 File read2 - Array[Int] taxon_ids # suggest using a workspace element if user wants to modify? 
+ # default taxon IDs for Illumina VSP panel + Array[Int] taxon_ids + # = [10244, 10255, 10298, 10359, 10376, 10632, 10804, 11021, 11029, 11033, 11034, 11036, 11039, 11041, 11053, 11060, 11069, 11070, 11072, 11079, 11080, 11082, 11083, 11084, 11089, 11137, 11234, 11292, 11520, 11552, 11577, 11580, 11587, 11588, 11676, 11709, 12092, 12475, 12538, 12542, 28875, 28876, 31631, 33743, 35305, 35511, 36427, 37124, 38766, 38767, 45270, 46839, 57482, 57483, 59301, 64286, 64320, 68887, 80935, 90961, 95341, 102793, 102796, 108098, 114727, 114729, 118655, 119210, 129875, 129951, 130308, 130309, 130310, 138948, 138949, 138950, 138951, 147711, 147712, 152219, 162145, 169173, 186538, 186539, 186540, 186541, 238817, 277944, 290028, 333278, 333760, 333761, 333762, 440266, 463676, 493803, 536079, 565995, 862909, 1003835, 1216928, 1221391, 1239565, 1239570, 1239573, 1277649, 1313215, 1330524, 1335626, 1348384, 1424613, 1452514, 1474807, 1497391, 1608084, 1618189, 1891764, 1891767, 1965344, 1980456, 2010960, 2169701, 2169991, 2560525, 2560602, 2697049, 2847089, 2901879, 2907957, 3052148, 3052223, 3052225, 3052230, 3052302, 3052307, 3052310, 3052314, 3052470, 3052477, 3052480, 3052489, 3052490, 3052493, 3052496, 3052499, 3052503, 3052505, 3052518, 10798, 11216, 1203539, 12730, 142786, 1803956, 208893, 2560526, 2849717, 3052303, 3052317, 3052498, 746830, 746831, 943908] + # suggest using a workspace element if user wants to modify? 
Int minimum_read_number = 1000 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" @@ -31,9 +34,17 @@ workflow theiameta_panel_illumina_pe { samplename = samplename, read1 = read1, read2 = read2, - workflow_series = "theiameta" + workflow_series = "theiameta", + # adding these additional inputs to hide them from Terra; these are not used + call_kraken = false, + kraken_disk_size = 0, + kraken_memory = 0, + kraken_cpu = 0, + kraken_db = kraken2_db, + target_organism = "" } # kraken does not run as part of the theiameta track in read_QC_trim -- we may want to change that + # if we do change that, we will want to change the inputs to read_QC_trim to no longer have defaults hiding them from Terra call kraken_task.kraken2_standalone as kraken2 { input: samplename = samplename, @@ -60,7 +71,6 @@ workflow theiameta_panel_illumina_pe { } #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### if (fastq_scan_binned.read1_seq > minimum_read_number) { - String did_attempt_assembly = "Assembly attempted" call metaspades_task.metaspades_pe { input: read1_cleaned = krakentools.extracted_read1, From 43e0efd0943beb3e3734b60e5f0c8aab1670f959 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 23 Oct 2024 19:11:53 +0000 Subject: [PATCH 37/48] add inputs and outputs to docs --- .../theiameta_panel.md | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index fe5f002e4..6164930ec 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -22,7 +22,62 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | fastq_scan_binned | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | fastq_scan_binned | **docker** | String | The Docker 
container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | | fastq_scan_binned | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | -| fastq_scan_binned | +| gather_scatter | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| gather_scatter | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| gather_scatter | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | +| gather_scatter | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| kraken2 | **classified_out** | String | Allows user to rename the classified FASTQ files output. Must include .fastq as the suffix | classified#.fastq | Optional | + kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| kraken2 | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | +| kraken2 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | +| kraken2 | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | +| kraken2 | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| kraken2 | **unclassified_out** | String | Allows user to rename unclassified FASTQ files output. 
Must include .fastq as the suffix | unclassified#.fastq | Optional | +| krakentools | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| krakentools | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| krakentools | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/krakentools:d4a2fbe| Optional | +| krakentools | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| metaspades | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| metaspades | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| metaspades | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/metaspades:3.15.3 | Optional | +| metaspades | **kmers** | String | The k-mer list to use; if not provided, the value is automatically set | | Optional | +| metaspades | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 32 | Optional | +| metaspades | **metaspades_opts** | String | Additional arguments to pass on to the metaspades command | | Optional | +| metaspades | **phred_offset** | Int | The PHRED quality offset of the input reads; can be either 33 or 64 | 33 | Optional | +| minimap2_assembly_correction | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| minimap2_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| minimap2_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22 | Optional | +| minimap2_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| pilon | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| pilon | **disk_size** | Int | 
Amount of storage (in GB) to allocate to the task | 100 | Optional | +| pilon | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pilon:1.24 | Optional | +| pilon | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| quast | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| quast | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| quast | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/quast:5.0.2 | Optional | +| quast | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim | **adapters** | File | A file containing the sequence of the adapters used during library preparation, used in the BBDuk task | | Optional | +| read_QC_trim | **bbduk_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| read_QC_trim | **call_kraken** | Boolean | Set to true to launch Kraken2; if true, you must provide a kraken_db | FALSE | Optional | +| read_QC_trim | **call_midas** | Boolean | Set to true to launch Midas | TRUE | Optional | +| read_QC_trim | **fastp_args** | String | Additional arguments to pass to fastp | "--detect_adapter_for_pe -g -5 20 -3 20 | Optional | +| read_QC_trim | **midas_db** | File | Midas database file | gs://theiagen-large-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz | Optional | +| read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | +| read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | Optional | +| read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. 
| fastq_scan | Optional | +| read_QC_trim | **trim_min_length** | Int | The minimum length of each read after trimming | 75 | Optional | +| read_QC_trim | **trim_primers** | Boolean | A True/False option that determines if primers should be trimmed. | TRUE | Optional | +| read_QC_trim | **trim_quality_min_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | +| read_QC_trim | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 4 | Optional | +| read_QC_trim | **trimmomatic_args** | String | Additional arguments to pass to trimmomatic | -phred33 | Optional | +| sort_bam_assembly_correction | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| sort_bam_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| sort_bam_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | +| sort_bam_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| theiameta_panel_illumina_pe | **kraken2_db** | File | A Kraken2 database in .tar.gz format | gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz | Optional | +| theiameta_panel_illumina_pe | **minimum_read_number** | Int | The minimum number of reads in order to attempt assembly on a bin of reads | 1000 | Optional | +| version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | +| version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | ### Workflow Tasks @@ -126,7 +181,15 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | **Variable** | **Type** | 
**Description** | |---|---|---| -| variable_name | Type | Description | +| identified_organisms | Array[String] | A list of organisms that were able to be identified in the sample with the specified Kraken2 database | +| kraken2_classified_report | File | Standard Kraken2 output report. TXT filetype, but can be opened in Excel as a TSV file | +| kraken2_database | String | The name of the database used to run Kraken2 | +| kraken2_docker | String | Docker image used to run kraken2 | +| kraken2_report | File | Text document describing taxonomic prediction of every FASTQ record. This file can be very large and cumbersome to open and view | +| kraken2_version | String | The version of Kraken2 used in the analysis | +| results_by_taxon_tsv | File | A TSV file that contains the results for every taxon ID provided in the taxon_ids input variable that had reads identified; characterization (if applicable) and basic statistics regarding read count, assembly generation (if applicable), and general quality, are also associated with each bin | +| theiameta_panel_illumina_pe_analysis_date | String | Date the workflow was run | +| theiameta_panel_illumina_pe_version | String | Version of PHB used to run the workflow | ## References (if applicable) From 547a920d4ce4f882a2555132c7e4feddae99b7ff Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Wed, 23 Oct 2024 19:14:05 +0000 Subject: [PATCH 38/48] enable searchable --- .../theiameta_panel.md | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 6164930ec..842509c61 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -12,6 +12,8 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; ### Inputs +
+ | **Terra Task Name** | **Variable** | **Type** | **Description** | **Default Value** | **Terra Status** | |---|---|---|---|---|---| | theiameta_panel_illumina_pe | **read1** | File | The forward Illumina read in FASTQ file format (compression optional) | | Required | @@ -79,6 +81,8 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | | version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | +
+ ### Workflow Tasks ??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" @@ -179,6 +183,8 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; ### Outputs +
+ | **Variable** | **Type** | **Description** | |---|---|---| | identified_organisms | Array[String] | A list of organisms that were able to be identified in the sample with the specified Kraken2 database | @@ -191,8 +197,23 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | theiameta_panel_illumina_pe_analysis_date | String | Date the workflow was run | | theiameta_panel_illumina_pe_version | String | Version of PHB used to run the workflow | +
+ ## References (if applicable) -> reference1 +>**Trimmomatic:** Anthony M. Bolger and others, Trimmomatic: a flexible trimmer for Illumina sequence data, *Bioinformatics*, Volume 30, Issue 15, August 2014, Pages 2114–2120,  + +>**Fastq-Scan:** + +>**metaSPAdes:** Sergey Nurk and others, metaSPAdes: a new versatile metagenomic assembler, *Genome Res.* 2017 May; 27(5): 824–834.,  + +>**Pilon:** Bruce J. Walker and others. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. *Plos One.* November 19, 2014. + +>**Minimap2:** Heng Li, Minimap2: pairwise alignment for nucleotide sequences, *Bioinformatics*, Volume 34, Issue 18, September 2018, Pages 3094–3100,  + +>**QUAST:** Alexey Gurevich and others, QUAST: quality assessment tool for genome assemblies, *Bioinformatics*, Volume 29, Issue 8, April 2013, Pages 1072–1075,  + +>**Samtools:** Li, Heng, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor Marth, Goncalo Abecasis, Richard Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence Alignment/Map format and SAMtools. Bioinformatics 25(16): 2078-2079. + +>**Bcftools:** Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. Twelve years of SAMtools and BCFtools. 
GigaScience, Volume 10, Issue 2, February 2021, giab008, -> reference2 From 82695a7216f65efde8dc944df25146449456d711 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Thu, 24 Oct 2024 20:40:35 +0000 Subject: [PATCH 39/48] set default, expand docs --- .../theiameta_panel.md | 210 ++++++++++++++++-- .../wf_theiameta_panel_illumina_pe.wdl | 3 +- 2 files changed, 191 insertions(+), 22 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 842509c61..1549dbd05 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -8,7 +8,189 @@ ## TheiaMeta_Panel_Illumina_PE_PHB -TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. +TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Surveillance Panel](https://www.illumina.com/products/by-type/sequencing-kits/library-prep-kits/viral-surveillance-panel.html)[^1]; however, it can be used for any panel that is sequenced using Illumina paired-end reads if the appropriate taxon IDs are provided. TheiaMeta_Panel performs taxonomic binning, and then assembles the bins into contigs. If the contigs are associated with a supported organism, genomic characterization will be performed. + +[^1]: We are not affiliated with Illumina, Inc. The mention of the Illumina Viral Surveillance Panel is for informational purposes only. + +??? 
toggle "**What organisms and taxon IDs are identified by default?**" + The Illumina VSP panel contains over 224 viral species, of which 163 can be identified in the default Kraken2 viral database. + + Accordingly, the following 163 taxon IDs are used by default in TheiaMeta_Panel_Illumina_PE. Feel free to search this table to see if your organism of interest is included. + +
+ + | **Taxon ID** | **Organism Name in Illumina VSP Panel** | + |---|---| + | 10804 | Adeno-associated virus 2 (AAV2) | + | 1313215 | Aichi virus 1 (AiV-A1) | + | 2849717  | Aigai virus (AIGV) | + | 1980456 | Andes virus (ANDV) | + | 1424613 | Anjozorobe virus (ANJV) | + | 90961 | Australian bat lyssavirus (ABLV) | + | 3052470 | Bayou virus (BAYV) | + | 3052490 | Black Creek Canal virus (BCCV) | + | 2010960 | Bombali virus (BOMV) | + | 1618189 | Bourbon virus (BRBV) | + | 565995 | Bundibugyo virus (BDBV) | + | 80935 | Cache Valley virus (CVV) | + | 35305 | California encephalitis virus (CEV) | + | 1221391 | Cedar virus (CedV) | + | 3052302 | Chapare virus (CHAPV) | + | 37124 | Chikungunya virus (CHIKV) | + | 169173 | Choclo virus (CHOV) | + | 46839 | Colorado tick fever virus (CTFV) | + | 138948 | Coxsackievirus A | + | 138949 | Coxsackievirus B | + | 3052518 | Crimean-Congo hemorrhagic fever virus (CCHFV) | + | 11053 | Dengue Virus 1 | + | 11060 | Dengue Virus 2 | + | 11069 | Dengue Virus 3 | + | 11070 | Dengue Virus 4 | + | 3052477 | Dobrava virus (DOBV) | + | 38767 | Duvenhage virus (DUVV) | + | 11021 | Eastern equine encephalitis virus (EEEV) | + | 138951 | Enterovirus D | + | 10376 | Epstein-Barr virus (EBV) | + | 57482 | European bat lyssavirus 1 | + | 57483 | European bat lyssavirus 2 | + | 2847089 | Ghana virus (GhV) | + | 3052307 | Guanarito virus (GTOV) | + | 3052480 | Hantaan virus (HTNV) | + | 1216928 | Heartland virus (HRTV) | + | 3052223 | Hendra virus (HeV) | + | 12092 | Hepatitis A virus (HAV) | + | 3052230 | Hepatitis C virus (HCV) | + | 12475 | Hepatitis D virus (HDV) | + | 10298 | Herpes simplex virus 1 (HSV1) | + | 129875 | Human adenovirus A | + | 108098 | Human adenovirus B | + | 129951 | Human adenovirus C | + | 130310 | Human adenovirus D | + | 130308 | Human adenovirus E | + | 130309 | Human adenovirus F | + | 536079 | Human adenovirus G | + | 11137 | Human coronavirus 229E (HCoV_229E) | + | 290028 | Human coronavirus HKU1 (HCoV_HKU1) | + 
| 277944 | Human coronavirus NL63 (HCoV_NL63) | + | 31631 | Human coronavirus OC43 (HCoV_OC43) | + | 10359 | Human cytomegalovirus (HCMV) | + | 11676 | Human immunodeficiency virus 1 (HIV-1) | + | 11709 | Human immunodeficiency virus 2 (HIV-2) | + | 162145 | Human metapneumovirus (HMPV) | + | 333760 | Human papillomavirus 16 (HPV16; high-risk) | + | 333761 | Human papillomavirus 18 (HPV18; high-risk) | + | 333762 | Human papillomavirus 26 (HPV26) | + | 12730 | Human parainfluenza virus 1 (HPIV-1) | + | 2560525 | Human parainfluenza virus 2 (HPIV-2) | + | 11216 | Human parainfluenza virus 3 (HPIV-3) | + | 2560526  | Human parainfluenza virus 4 (HPIV-4) | + | 1803956  | Human parechovirus (HPeV) | + | 10798  | Human parvovirus B19 (B19V) | + | 746830 | Human polyomavirus 6 (HPyV6) | + | 746831 | Human polyomavirus 7 (HPyV7) | + | 943908 | Human polyomavirus 9 (HPyV9) | + | 208893 | Human respiratory syncytial virus A (HRSV-A) | + | 114727 | Influenza A virus (H1N1) | + | 114729 | Influenza A virus (H2N2) | + | 119210 | Influenza A virus (H3N2) | + | 102793 | Influenza A virus (H5N1) | + | 333278 | Influenza A virus (H7N9) | + | 102796 | Influenza A virus (H9N2) | + | 11520 | Influenza B virus | + | 11552 | Influenza C virus | + | 35511 | Jamestown Canyon virus (JCV) | + | 11072 | Japanese encephalitis virus (JEV) | + | 10632 | JC polyomavirus (JCPyV) | + | 2169991 | Junin virus (JUNV) | + | 1891764 | KI polyomavirus (KIPyV) | + | 33743 | Kyasanur Forest disease virus (KFDV) | + | 11577 | La Crosse virus (LACV) | + | 38766 | Lagos bat virus (LBV) | + | 3052489 | Laguna Negra virus (LANV) | + | 3052310 | Lassa virus (LASV) | + | 1965344 | LI polyomavirus (LIPyV) | + | 3052148 | Lloviu virus (LLOV) | + | 3052314 | Lujo virus (LUJV) | + | 3052303 | Lymphocytic choriomeningitis virus (LCMV) | + | 3052317 | Machupo virus (MACV) | + | 1239565 | Mamastrovirus 1 (MAstV1) | + | 1239570 | Mamastrovirus 6 (MAstV6) | + | 1239573 | Mamastrovirus 9 (MAstV9) | + | 238817 | Maporal 
virus (MAPV) | + | 3052505 | Marburg virus (MARV) | + | 59301 | Mayaro virus (MAYV) | + | 11234 | Measles virus (MV) | + | 152219 | Menangle virus (MenV) | + | 493803 | Merkel cell polyomavirus (MCPyV) | + | 1335626 | Middle East respiratory syndrome-related coronavirus (MERS-CoV) | + | 1474807 | Mojiang virus (MojV) | + | 12538 | Mokola virus (MOKV) | + | 10244 | Monkeypox virus (MPV) | + | 2560602 | Mumps virus (MuV) | + | 11079 | Murray Valley encephalitis virus (MVEV) | + | 1203539 | MW polyomavirus (MWPyV) | + | 1497391 | New Jersey polyomavirus (NJPyV) | + | 3052225 | Nipah virus (NiV) | + | 142786 | Norovirus | + | 12542 | Omsk hemorrhagic fever virus (OHFV) | + | 2169701 | Onyong-nyong virus (ONNV) | + | 118655 | Oropouche virus (OROV) | + | 138950 | Poliovirus | + | 11083 | Powassan virus (POWV) | + | 11587 | Punta Toro virus (PTV) | + | 3052493 | Puumala virus (PUUV) | + | 11292 | Rabies virus (RABV) | + | 186539 | Reston virus (RESTV) | + | 147711 | Rhinovirus A (RV-A) | + | 147712 | Rhinovirus B (RV-B) | + | 463676 | Rhinovirus C (RV-C) | + | 11588 | Rift Valley fever virus (RVFV) | + | 11029 | Ross River virus (RRV) | + | 28875 | Rotavirus A (RVA) | + | 28876 | Rotavirus B (RVB) | + | 36427 | Rotavirus C (RVC) | + | 1348384 | Rotavirus H (RVH) | + | 11041 | Rubella virus (RuV) | + | 2907957 | Sabia virus (SBAV) | + | 1330524 | Salivirus A (SaV-A) | + | 3052496 | Sangassou virus (SANGV) | + | 95341 | Sapovirus | + | 11033 | Semliki Forest virus (SFV) | + | 3052498 | Seoul virus (SEOV) | + | 2901879 | Severe acute respiratory syndrome coronavirus (SARS-CoV) | + | 2697049 | Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) | + | 1003835 | Severe fever with thrombocytopenia syndrome virus (SFTSV) | + | 1891767 | Simian virus 40 (SV40) | + | 3052499 | Sin nombre virus (SNV) | + | 11034 | Sindbis virus (SINV) | + | 11580 | Snowshoe hare virus (SSHV) | + | 1452514 | Sosuga virus (SoRV) | + | 11080 | St. 
Louis encephalitis virus (SLEV) | + | 1277649 | STL polyomavirus (STLPyV) | + | 186540 | Sudan virus (SUDV) | + | 1608084 | Tacheng tick virus 2 (TcTV-2) | + | 45270 | Tahyna virus (TAHV) | + | 186541 | Tai Forest virus (TAFV) | + | 11084 | Tick-borne encephalitis virus (TBEV) | + | 68887 | Torque teno virus (TTV) | + | 862909 | Trichodysplasia spinulosa-associated polyomavirus (TSPyV) | + | 3052503 | Tula virus (TULV) | + | 64286 | Usutu virus (USUV) | + | 10255 | Variola virus (VARV) | + | 11036 | Venezuelan equine encephalitis virus (VEEV) | + | 11082 | West Nile virus (WNV) | + | 11039 | Western equine encephalitis virus (WEEV) | + | 440266 | WU polyomavirus (WUPyV) | + | 11089 | Yellow fever virus (YFV) | + | 186538 | Zaire ebolavirus(EBOV) | + | 64320 | Zika virus (ZIKV) | + +
+ +!!! tip "Make your own list of taxon IDs" + You may want to make your own list of taxon IDs if you know your sample is likely to contain a specific organism or group of organisms. You can find taxon IDs in the [NCBI Taxonomy Browser](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi). + + In Terra, you provide your created list of taxon IDs as an array of integers for the `taxon_ids` optional input variable, like this: `[1, 2, 3, 4, 5]`. Just replace the numbers in this example with the taxon IDs you want to use. ### Inputs @@ -170,7 +352,7 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; | Software Documentation | | | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | -??? task "`KrakenTools extract_kraken_reads`: Read Binning" +??? task "`extract_kraken_reads` from KrakenTools: Read Binning" KrakenTools is a collection of scripts that can be used to help downstream analysis of Kraken2 results. In particular, this task uses the `extract_kraken_reads` script, which extracts reads classified at any user-specified taxonomy IDs. All parent and children reads of the specified taxonomic ID are also extracted. !!! techdetails "KrakenTools Technical Details" @@ -199,21 +381,9 @@ TheiaMeta_Panel was created initially for the Illumina Viral Surveillance Panel; -## References (if applicable) - ->**Trimmomatic:** Anthony M. Bolger and others, Trimmomatic: a flexible trimmer for Illumina sequence data, *Bioinformatics*, Volume 30, Issue 15, August 2014, Pages 2114–2120,  - ->**Fastq-Scan:** - ->**metaSPAdes:** Sergey Nurk and others, metaSPAdes: a new versatile metagenomic assembler, *Genome Res.* 2017 May; 27(5): 824–834.,  - ->**Pilon:** Bruce J. Walker and others. Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement. *Plos One.* November 19, 2014. 
- ->**Minimap2:** Heng Li, Minimap2: pairwise alignment for nucleotide sequences, *Bioinformatics*, Volume 34, Issue 18, September 2018, Pages 3094–3100,  - ->**QUAST:** Alexey Gurevich and others, QUAST: quality assessment tool for genome assemblies, *Bioinformatics*, Volume 29, Issue 8, April 2013, Pages 1072–1075,  - ->**Samtools:** Li, Heng, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor Marth, Goncalo Abecasis, Richard Durbin, and 1000 Genome Project Data Processing Subgroup. 2009. The Sequence Alignment/Map format and SAMtools. Bioinformatics 25(16): 2078-2079. - ->**Bcftools:** Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. Twelve years of SAMtools and BCFtools. GigaScience, Volume 10, Issue 2, February 2021, giab008, - +#### The `results_by_taxon_tsv` Output File + +This file contains the + +## References + diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 67610109f..889935a03 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -19,8 +19,7 @@ workflow theiameta_panel_illumina_pe { File read1 File read2 # default taxon IDs for Illumina VSP panel - Array[Int] taxon_ids - # = [10244, 10255, 10298, 10359, 10376, 10632, 10804, 11021, 11029, 11033, 11034, 11036, 11039, 11041, 11053, 11060, 11069, 11070, 11072, 11079, 11080, 11082, 11083, 11084, 11089, 11137, 11234, 11292, 11520, 11552, 11577, 11580, 11587, 11588, 11676, 11709, 12092, 12475, 12538, 12542, 28875, 28876, 31631, 33743, 35305, 35511, 36427, 37124, 38766, 38767, 45270, 46839, 57482, 57483, 59301, 64286, 64320, 68887, 80935, 90961, 95341, 102793, 102796, 108098, 114727, 114729, 118655, 119210, 129875, 129951, 130308, 130309, 130310, 138948, 138949, 138950, 138951, 147711, 147712, 152219, 162145, 
169173, 186538, 186539, 186540, 186541, 238817, 277944, 290028, 333278, 333760, 333761, 333762, 440266, 463676, 493803, 536079, 565995, 862909, 1003835, 1216928, 1221391, 1239565, 1239570, 1239573, 1277649, 1313215, 1330524, 1335626, 1348384, 1424613, 1452514, 1474807, 1497391, 1608084, 1618189, 1891764, 1891767, 1965344, 1980456, 2010960, 2169701, 2169991, 2560525, 2560602, 2697049, 2847089, 2901879, 2907957, 3052148, 3052223, 3052225, 3052230, 3052302, 3052307, 3052310, 3052314, 3052470, 3052477, 3052480, 3052489, 3052490, 3052493, 3052496, 3052499, 3052503, 3052505, 3052518, 10798, 11216, 1203539, 12730, 142786, 1803956, 208893, 2560526, 2849717, 3052303, 3052317, 3052498, 746830, 746831, 943908] + Array[Int] taxon_ids = [10244, 10255, 10298, 10359, 10376, 10632, 10804, 11021, 11029, 11033, 11034, 11036, 11039, 11041, 11053, 11060, 11069, 11070, 11072, 11079, 11080, 11082, 11083, 11084, 11089, 11137, 11234, 11292, 11520, 11552, 11577, 11580, 11587, 11588, 11676, 11709, 12092, 12475, 12538, 12542, 28875, 28876, 31631, 33743, 35305, 35511, 36427, 37124, 38766, 38767, 45270, 46839, 57482, 57483, 59301, 64286, 64320, 68887, 80935, 90961, 95341, 102793, 102796, 108098, 114727, 114729, 118655, 119210, 129875, 129951, 130308, 130309, 130310, 138948, 138949, 138950, 138951, 147711, 147712, 152219, 162145, 169173, 186538, 186539, 186540, 186541, 238817, 277944, 290028, 333278, 333760, 333761, 333762, 440266, 463676, 493803, 536079, 565995, 862909, 1003835, 1216928, 1221391, 1239565, 1239570, 1239573, 1277649, 1313215, 1330524, 1335626, 1348384, 1424613, 1452514, 1474807, 1497391, 1608084, 1618189, 1891764, 1891767, 1965344, 1980456, 2010960, 2169701, 2169991, 2560525, 2560602, 2697049, 2847089, 2901879, 2907957, 3052148, 3052223, 3052225, 3052230, 3052302, 3052307, 3052310, 3052314, 3052470, 3052477, 3052480, 3052489, 3052490, 3052493, 3052496, 3052499, 3052503, 3052505, 3052518, 10798, 11216, 1203539, 12730, 142786, 1803956, 208893, 2560526, 2849717, 3052303, 3052317, 
3052484, 3052498, 746830, 746831, 943908] # suggest using a workspace element if user wants to modify? Int minimum_read_number = 1000 From 033cbc053a2b17cc767a9c81d0d1db1ab9e6e904 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 13:22:42 +0000 Subject: [PATCH 40/48] update contributors --- README.md | 3 ++- docs/index.md | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a9046673d..da0701f03 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ You can expect a careful review of every PR and feedback as needed before mergin * **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision * **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation * **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation -* **Michal-Babins** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation +* **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation * **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision * **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision * **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation @@ -65,6 +65,7 @@ We would like to gratefully acknowledge the following individuals from the publi * **James Otieno** ([@jrotieno](https://github.com/jrotieno)) * **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +* **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) * **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) * **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) * **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) diff --git a/docs/index.md b/docs/index.md index 2d8eb679a..95b148c37 100644 --- a/docs/index.md +++ 
b/docs/index.md @@ -69,8 +69,7 @@ You can expect a careful review of every PR and feedback as needed before mergin - **Michelle Scribner** ([@michellescribner](https://github.com/michellescribner)) - Software, Validation - **Kevin Libuit** ([@kevinlibuit](https://github.com/kevinlibuit)) - Conceptualization, Project Administration, Software, Validation, Supervision - **Fraser Combe** ([@fraser-combe](https://github.com/fraser-combe)) - Software, Validation -- **Michal-Babins** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation -- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - Software, Validation +- **Michal Babinski** ([@Michal-Babins](https://github.com/Michal-Babins)) - Software, Validation - **Andrew Lang** ([@AndrewLangVt](https://github.com/AndrewLangVt)) - Software, Supervision - **Andrew Page** ([@andrewjpage](https://github.com/andrewjpage)) - Project Administration, Software, Supervision - **Kelsey Kropp** ([@kelseykropp](https://github.com/kelseykropp)) - Validation @@ -83,6 +82,7 @@ We would like to gratefully acknowledge the following individuals from the publi - **James Otieno** ([@jrotieno](https://github.com/jrotieno)) - **Robert Petit** ([@rpetit3](https://github.com/rpetit3)) +- **Emma Doughty** ([@emmadoughty](https://github.com/emmadoughty)) - **Ash O'Farrel** ([@aofarrel](https://github.com/aofarrel)) - **Sam Baird** ([@sam-baird](https://github.com/sam-baird)) - **Holly Halstead** ([@HNHalstead](https://github.com/HNHalstead)) From c8b658a6886b0da910992d5c51cf6041469848cd Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 13:53:43 +0000 Subject: [PATCH 41/48] input explosion --- .../betacoronavirus/task_pangolin.wdl | 6 +- workflows/utilities/wf_flu_track.wdl | 3 +- workflows/utilities/wf_morgana_magic.wdl | 100 ++++++++++++++++-- .../utilities/wf_organism_parameters.wdl | 2 +- 4 files changed, 98 insertions(+), 13 deletions(-) diff --git 
a/tasks/species_typing/betacoronavirus/task_pangolin.wdl b/tasks/species_typing/betacoronavirus/task_pangolin.wdl index 848a2fdc5..fa2fc9868 100644 --- a/tasks/species_typing/betacoronavirus/task_pangolin.wdl +++ b/tasks/species_typing/betacoronavirus/task_pangolin.wdl @@ -8,9 +8,9 @@ task pangolin4 { Float max_ambig = 0.5 String docker String? analysis_mode - Boolean expanded_lineage=true - Boolean skip_scorpio=false - Boolean skip_designation_cache=false + Boolean expanded_lineage = true + Boolean skip_scorpio = false + Boolean skip_designation_cache = false String? pangolin_arguments Int disk_size = 100 Int memory = 8 diff --git a/workflows/utilities/wf_flu_track.wdl b/workflows/utilities/wf_flu_track.wdl index 6cebdf4e8..4cf22fc57 100644 --- a/workflows/utilities/wf_flu_track.wdl +++ b/workflows/utilities/wf_flu_track.wdl @@ -53,6 +53,7 @@ workflow flu_track { Int? abricate_flu_disk_size # flu antiviral substitutions subworkflow inputs + Boolean analyze_flu_antiviral_substitutions = true File? flu_h1_ha_ref File? flu_h3_ha_ref File? flu_n1_na_ref @@ -75,8 +76,6 @@ workflow flu_track { Int? nextclade_output_parser_cpu Int? nextclade_output_parser_memory Int? nextclade_output_parser_disk_size - - Boolean analyze_flu_antiviral_substitutions = true } # IRMA will run if no assembly is provided (as in the case of TheiaCoV_FASTA) call irma_task.irma { diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index 686497124..73961d6c9 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -15,19 +15,66 @@ workflow morgana_magic { File read2 String taxon_id String seq_method + # consensus qc + Int? consensus_qc_cpu + Int? consensus_qc_disk_size + String? consensus_qc_docker + Int? consensus_qc_memory + # assembly metrics + Int? assembly_metrics_cpu + Int? assembly_metrics_disk_size + String? assembly_metrics_docker + Int? assembly_metrics_memory + # flu track - irma + Int? 
irma_cpu + Int? irma_disk_size + String? irma_docker_image + Boolean? irma_keep_ref_deletions + Int? irma_memory + # flu track - genoflu + Int? genoflu_cpu + File? genoflu_cross_reference + Int? genoflu_disk_size + String? genoflu_docker + Int? genoflu_memory + # flu track - abricate + Int? abricate_flu_cpu + Int? abricate_flu_disk_size + String? abricate_flu_docker + Int? abricate_flu_memory + Int? abricate_flu_mincov + Int? abricate_flu_minid + # nextclade inputs + Int? nextclade_cpu + Int? nextclade_disk_size + String? nextclade_docker_image + Int? nextclade_memory + Int? nextclade_output_parser_cpu + Int? nextclade_output_parser_disk_size + String? nextclade_output_parser_docker + Int? nextclade_output_parser_memory + # pangolin inputs + Int? pangolin_cpu + Int? pangolin_disk_size + String? pangolin_docker + Int? pangolin_memory } - #### need to add more flu characterization call set_organism_defaults.organism_parameters { input: taxon_id = taxon_id, - organism = "unsupported" + organism = "unsupported", + pangolin_docker_image = pangolin_docker } if (organism_parameters.standardized_organism != "unsupported") { # occurs in theiameta_panel call consensus_qc_task.consensus_qc { input: assembly_fasta = assembly_fasta, reference_genome = organism_parameters.reference, - genome_length = organism_parameters.genome_length + genome_length = organism_parameters.genome_length, + cpu = consensus_qc_cpu, + disk_size = consensus_qc_disk_size, + docker = consensus_qc_docker, + memory = consensus_qc_memory } } if (organism_parameters.standardized_organism == "flu") { @@ -38,7 +85,35 @@ workflow morgana_magic { read2 = read2, seq_method = seq_method, standardized_organism = organism_parameters.standardized_organism, - analyze_flu_antiviral_substitutions = false # don't try to look for antiviral substitutions?? or maybe? not sure + analyze_flu_antiviral_substitutions = false, # don't try to look for antiviral substitutions?? or maybe? 
not sure + assembly_metrics_cpu = assembly_metrics_cpu, + assembly_metrics_disk_size = assembly_metrics_disk_size, + assembly_metrics_docker = assembly_metrics_docker, + assembly_metrics_memory = assembly_metrics_memory, + irma_cpu = irma_cpu, + irma_disk_size = irma_disk_size, + irma_docker_image = irma_docker_image, + irma_keep_ref_deletions = irma_keep_ref_deletions, + irma_memory = irma_memory, + genoflu_cross_reference = genoflu_cross_reference, + genoflu_cpu = genoflu_cpu, + genoflu_disk_size = genoflu_disk_size, + genoflu_docker = genoflu_docker, + genoflu_memory = genoflu_memory, + abricate_flu_cpu = abricate_flu_cpu, + abricate_flu_disk_size = abricate_flu_disk_size, + abricate_flu_docker = abricate_flu_docker, + abricate_flu_memory = abricate_flu_memory, + abricate_flu_mincov = abricate_flu_mincov, + abricate_flu_minid = abricate_flu_minid, + nextclade_cpu = nextclade_cpu, + nextclade_disk_size = nextclade_disk_size, + nextclade_docker_image = nextclade_docker_image, + nextclade_memory = nextclade_memory, + nextclade_output_parser_cpu = nextclade_output_parser_cpu, + nextclade_output_parser_disk_size = nextclade_output_parser_disk_size, + nextclade_output_parser_docker = nextclade_output_parser_docker, + nextclade_output_parser_memory = nextclade_output_parser_memory } } if (organism_parameters.standardized_organism == "sars-cov-2") { @@ -46,7 +121,10 @@ workflow morgana_magic { input: samplename = samplename, fasta = assembly_fasta, - docker = organism_parameters.pangolin_docker + docker = organism_parameters.pangolin_docker, + cpu = pangolin_cpu, + disk_size = pangolin_disk_size, + memory = pangolin_memory } } if (organism_parameters.standardized_organism == "MPXV" || organism_parameters.standardized_organism == "sars-cov-2" || organism_parameters.standardized_organism == "rsv_a" || organism_parameters.standardized_organism == "rsv_b") { @@ -54,12 +132,20 @@ workflow morgana_magic { input: genome_fasta = assembly_fasta, dataset_name = 
organism_parameters.nextclade_dataset_name, - dataset_tag = organism_parameters.nextclade_dataset_tag + dataset_tag = organism_parameters.nextclade_dataset_tag, + cpu = nextclade_cpu, + disk_size = nextclade_disk_size, + docker = nextclade_docker_image, + memory = nextclade_memory } call nextclade_task.nextclade_output_parser { input: nextclade_tsv = nextclade_v3.nextclade_tsv, - organism = organism_parameters.standardized_organism + organism = organism_parameters.standardized_organism, + cpu = nextclade_output_parser_cpu, + disk_size = nextclade_output_parser_disk_size, + docker = nextclade_output_parser_docker, + memory = nextclade_output_parser_memory } } ##### is running quasitools even something we want to do???? diff --git a/workflows/utilities/wf_organism_parameters.wdl b/workflows/utilities/wf_organism_parameters.wdl index 0251a19a2..a88779842 100644 --- a/workflows/utilities/wf_organism_parameters.wdl +++ b/workflows/utilities/wf_organism_parameters.wdl @@ -23,7 +23,7 @@ workflow organism_parameters { File? gene_locations_bed_file Int? genome_length_input - # set default nextclade information as NA + # set default nextclade information as "NA" String? nextclade_dataset_tag_input String? nextclade_dataset_name_input From 92acb240f2262a086f0e5e75c68607abcf7ad04f Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 13:56:08 +0000 Subject: [PATCH 42/48] make good --- workflows/utilities/wf_morgana_magic.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index 73961d6c9..26bed4bb9 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -56,14 +56,14 @@ workflow morgana_magic { # pangolin inputs Int? pangolin_cpu Int? pangolin_disk_size - String? pangolin_docker + String? pangolin_docker_image Int? 
pangolin_memory } call set_organism_defaults.organism_parameters { input: taxon_id = taxon_id, organism = "unsupported", - pangolin_docker_image = pangolin_docker + pangolin_docker_image = pangolin_docker_image } if (organism_parameters.standardized_organism != "unsupported") { # occurs in theiameta_panel call consensus_qc_task.consensus_qc { From a3c7c52e5bd02dad434b3e5d61331c37726c3483 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 14:09:45 +0000 Subject: [PATCH 43/48] document the explosion --- .../theiameta_panel.md | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 1549dbd05..31821973f 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -201,7 +201,6 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | theiameta_panel_illumina_pe | **read1** | File | The forward Illumina read in FASTQ file format (compression optional) | | Required | | theiameta_panel_illumina_pe | **read2** | File | The reverse Illumina read in FASTQ file format (compression optional) | | Required | | theiameta_panel_illumina_pe | **samplename** | String | The name of the sample being analyzed | | Required | -| theiameta_panel_illumina_pe | **taxon_ids** | Array[Int] | The taxon IDs to be used for taxonomic binning | | Required | | fastq_scan_binned | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | | fastq_scan_binned | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | fastq_scan_binned | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/utility:1.1 | Optional | @@ -211,7 +210,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | 
gather_scatter | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16 | Optional | | gather_scatter | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | kraken2 | **classified_out** | String | Allows user to rename the classified FASTQ files output. Must include .fastq as the suffix | classified#.fastq | Optional | - kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| kraken2 | **cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | | kraken2 | **disk_size** | Int | GB of storage to request for VM used to run the kraken2 task. Increase this when using large (>30GB kraken2 databases such as the "k2_standard" database) | 100 | Optional | | kraken2 | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/kraken2:2.1.2-no-db | Optional | | kraken2 | **kraken2_args** | String | Allows a user to supply additional kraken2 command-line arguments | | Optional | @@ -232,6 +231,42 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | minimap2_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | minimap2_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/minimap2:2.22 | Optional | | minimap2_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| morgana_magic | **abricate_flu_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **abricate_flu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **abricate_flu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/abricate:1.0.1-insaflu-220727 | 
Optional | +| morgana_magic | **abricate_flu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **abricate_flu_mincov** | Int | Minimum DNA % coverage | 60 | Optional | +| morgana_magic | **abricate_flu_minid** | Int | Minimum DNA % identity | 70 | Optional | +| morgana_magic | **assembly_metrics_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **assembly_metrics_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **assembly_metrics_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.15 | Optional | +| morgana_magic | **assembly_metrics_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | +| morgana_magic | **cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| morgana_magic | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **docker** | String | The Docker container to use for the task | ngolin | Optional | +| morgana_magic | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| morgana_magic | **genoflu_cpu** | Int | Number of CPUs to allocate to the task | 1 | Optional | +| morgana_magic | **genoflu_cross_reference** | File | An Excel file to cross-reference BLAST findings; probably useful if novel genotypes are not in the default file used by genoflu.py | | Optional | +| morgana_magic | **genoflu_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 25 | Optional | +| morgana_magic | **genoflu_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/genoflu:1.03 | Optional | +| morgana_magic | **genoflu_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 2 | Optional | +| morgana_magic | **irma_cpu** | Int | Number 
of CPUs to allocate to the task | 4 | Optional | +| morgana_magic | **irma_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **irma_docker_image** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/cdcgov/irma:v1.1.5 | Optional | +| morgana_magic | **irma_keep_ref_deletions** | Boolean | True/False variable that determines if sites missed during read gathering should be deleted by ambiguation. | TRUE | Optional | +| morgana_magic | **irma_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 16 | Optional | +| morgana_magic | **nextclade_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **nextclade_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| morgana_magic | **nextclade_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1 | Optional | +| morgana_magic | **nextclade_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **nextclade_output_parser_cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | +| morgana_magic | **nextclade_output_parser_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 50 | Optional | +| morgana_magic | **nextclade_output_parser_docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/python/python:3.8.18-slim | Optional | +| morgana_magic | **nextclade_output_parser_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 4 | Optional | +| morgana_magic | **pangolin_cpu** | Int | Number of CPUs to allocate to the task | 4 | Optional | +| morgana_magic | **pangolin_disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | +| morgana_magic | **pangolin_docker_image** | String | The Docker container to use 
for the task | us-docker.pkg.dev/general-theiagen/staphb/pangolin:4.3.1-pdata-1.29 | Optional | +| morgana_magic | **pangolin_memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | | pilon | **cpu** | Int | Number of CPUs to allocate to the task | 2 | Optional | | pilon | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | pilon | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/pilon:1.24 | Optional | @@ -258,8 +293,9 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | sort_bam_assembly_correction | **disk_size** | Int | Amount of storage (in GB) to allocate to the task | 100 | Optional | | sort_bam_assembly_correction | **docker** | String | The Docker container to use for the task | us-docker.pkg.dev/general-theiagen/staphb/samtools:1.17 | Optional | | sort_bam_assembly_correction | **memory** | Int | Amount of memory/RAM (in GB) to allocate to the task | 8 | Optional | -| theiameta_panel_illumina_pe | **kraken2_db** | File | A Kraken2 database in .tar.gz format | gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz | Optional | +| theiameta_panel_illumina_pe | **kraken2_db** | File | A Kraken2 database in .tar.gz format | gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz | Optional | | theiameta_panel_illumina_pe | **minimum_read_number** | Int | The minimum number of reads in order to attempt assembly on a bin of reads | 1000 | Optional | +| theiameta_panel_illumina_pe | **taxon_ids** | Array[Int] | The taxon IDs to be used for taxonomic binning. 
By default, this array uses the taxon IDs listed above that are intended for the Illumina VSP panel | Illumina VSP panel (see above toggle) | Optional | | version_capture | **docker** | String | The Docker container to use for the task | "us-docker.pkg.dev/general-theiagen/theiagen/alpine-plus-bash:3.20.0" | Optional | | version_capture | **timezone** | String | Set the time zone to get an accurate date of analysis (uses UTC by default) | | Optional | From b0498ae074029b903dc21bb6fbc03ab1ab5f5f40 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 16:39:29 +0000 Subject: [PATCH 44/48] optionalize extracted reads --- tasks/taxon_id/task_krakentools.wdl | 10 +++---- .../data_handling/task_gather_scatter.wdl | 12 ++++---- .../wf_theiameta_panel_illumina_pe.wdl | 28 ++++++++----------- 3 files changed, 22 insertions(+), 28 deletions(-) diff --git a/tasks/taxon_id/task_krakentools.wdl b/tasks/taxon_id/task_krakentools.wdl index ea21adecb..35f8a1034 100644 --- a/tasks/taxon_id/task_krakentools.wdl +++ b/tasks/taxon_id/task_krakentools.wdl @@ -31,6 +31,9 @@ task extract_kraken_reads { if [ -s ~{taxon_id}_1.fastq ]; then echo "DEBUG: Taxon ~{taxon_id} reads extracted" echo "true" > CONTINUE + + gzip ~{taxon_id}_1.fastq + gzip ~{taxon_id}_2.fastq else echo "DEBUG: No reads were extracted for taxon ~{taxon_id}, removing empty files" echo "false" > CONTINUE @@ -38,13 +41,10 @@ task extract_kraken_reads { grep ~{taxon_id} ~{kraken2_report} | awk '{for (i=6; i <= NF; ++i) print $i}' | tr '\n' ' ' | xargs > ORGANISM_NAME - gzip ~{taxon_id}_1.fastq - gzip ~{taxon_id}_2.fastq - >>> output { - File extracted_read1 = "~{taxon_id}_1.fastq.gz" - File extracted_read2 = "~{taxon_id}_2.fastq.gz" + File? extracted_read1 = "~{taxon_id}_1.fastq.gz" + File? 
extracted_read2 = "~{taxon_id}_2.fastq.gz" String organism_name = read_string("ORGANISM_NAME") ### fix String krakentools_docker = docker Boolean success = read_boolean("CONTINUE") diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index d97fe6b25..398f3d3f9 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -18,7 +18,7 @@ task gather_scatter { # Assembly File? metaspades_warning File? pilon_warning - File? pilon_assembly_fasta### maybe????? + File? assembly_fasta # quast outputs File? quast_genome_length File? quast_number_contigs @@ -59,7 +59,7 @@ task gather_scatter { File? nextclade_aa_dels_flu_na File? nextclade_clade_flu_na File? nextclade_qc_flu_na - # change to be a docker with pandas + String docker = "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" Int disk_size = 50 Int cpu = 2 @@ -83,7 +83,6 @@ task gather_scatter { return None df = pd.DataFrame() - df = load_json_data("~{taxon_ids}", "taxon_ids", df) df = load_json_data("~{organism}", "organism", df) df = load_json_data("~{extracted_read1}", "extracted_read1", df) @@ -96,7 +95,7 @@ task gather_scatter { df = load_json_data("~{fastq_scan_version}", "fastq_scan_version", df) df = load_json_data("~{metaspades_warning}", "metaspades_warning", df) df = load_json_data("~{pilon_warning}", "pilon_warning", df) - df = load_json_data("~{pilon_assembly_fasta}", "pilon_assembly_fasta", df) + df = load_json_data("~{assembly_fasta}", "assembly_fasta", df) df = load_json_data("~{quast_genome_length}", "quast_genome_length", df) df = load_json_data("~{quast_number_contigs}", "quast_number_contigs", df) df = load_json_data("~{quast_n50}", "quast_n50", df) @@ -131,11 +130,11 @@ task gather_scatter { df = load_json_data("~{nextclade_aa_dels_flu_na}", "nextclade_aa_dels_flu_na", df) df = load_json_data("~{nextclade_clade_flu_na}", 
"nextclade_clade_flu_na", df) df = load_json_data("~{nextclade_qc_flu_na}", "nextclade_qc_flu_na", df) + df.insert(0, "samplename" , "~{samplename}") - print(df) + # print(df) df.to_csv("~{samplename}.results.tsv", sep='\t', index=False) - organism_names = df["organism"].replace('', np.nan).dropna() organism_names.to_csv("~{samplename}.organism_names.tsv", index=False, header=False) CODE @@ -143,7 +142,6 @@ task gather_scatter { output { File gathered_results = "~{samplename}.results.tsv" Array[String] organism_names = read_lines("~{samplename}.organism_names.tsv") - } runtime { docker: "~{docker}" diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 889935a03..3ba184a02 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -20,7 +20,6 @@ workflow theiameta_panel_illumina_pe { File read2 # default taxon IDs for Illumina VSP panel Array[Int] taxon_ids = [10244, 10255, 10298, 10359, 10376, 10632, 10804, 11021, 11029, 11033, 11034, 11036, 11039, 11041, 11053, 11060, 11069, 11070, 11072, 11079, 11080, 11082, 11083, 11084, 11089, 11137, 11234, 11292, 11520, 11552, 11577, 11580, 11587, 11588, 11676, 11709, 12092, 12475, 12538, 12542, 28875, 28876, 31631, 33743, 35305, 35511, 36427, 37124, 38766, 38767, 45270, 46839, 57482, 57483, 59301, 64286, 64320, 68887, 80935, 90961, 95341, 102793, 102796, 108098, 114727, 114729, 118655, 119210, 129875, 129951, 130308, 130309, 130310, 138948, 138949, 138950, 138951, 147711, 147712, 152219, 162145, 169173, 186538, 186539, 186540, 186541, 238817, 277944, 290028, 333278, 333760, 333761, 333762, 440266, 463676, 493803, 536079, 565995, 862909, 1003835, 1216928, 1221391, 1239565, 1239570, 1239573, 1277649, 1313215, 1330524, 1335626, 1348384, 1424613, 1452514, 1474807, 1497391, 1608084, 1618189, 1891764, 1891767, 1965344, 1980456, 2010960, 2169701, 2169991, 2560525, 2560602, 2697049, 
2847089, 2901879, 2907957, 3052148, 3052223, 3052225, 3052230, 3052302, 3052307, 3052310, 3052314, 3052470, 3052477, 3052480, 3052489, 3052490, 3052493, 3052496, 3052499, 3052503, 3052505, 3052518, 10798, 11216, 1203539, 12730, 142786, 1803956, 208893, 2560526, 2849717, 3052303, 3052317, 3052484, 3052498, 746830, 746831, 943908] - # suggest using a workspace element if user wants to modify? Int minimum_read_number = 1000 File kraken2_db = "gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz" @@ -34,7 +33,7 @@ workflow theiameta_panel_illumina_pe { read1 = read1, read2 = read2, workflow_series = "theiameta", - # adding these additional inputs to hide them from Terra; these are not used + # adding these additional inputs to hide them from Terra; these are not used and we don't want the user to modiy them call_kraken = false, kraken_disk_size = 0, kraken_memory = 0, @@ -54,8 +53,6 @@ workflow theiameta_panel_illumina_pe { scatter (taxon_id in taxon_ids) { call krakentools_task.extract_kraken_reads as krakentools { input: - # we should consider changing the classified_report name so - # it won't be confused with the actual kraken2 report kraken2_output = kraken2.kraken2_classified_report, kraken2_report = kraken2.kraken2_report, read1 = read_QC_trim.read1_clean, @@ -65,22 +62,21 @@ workflow theiameta_panel_illumina_pe { if (krakentools.success) { call fastq_scan.fastq_scan_pe as fastq_scan_binned { input: - read1 = krakentools.extracted_read1, - read2 = krakentools.extracted_read2 + read1 = select_first([krakentools.extracted_read1]), + read2 = select_first([krakentools.extracted_read2]) } - #### ADJUST IN THE FUTURE; SETTING TO 100 FOR TESTING #### if (fastq_scan_binned.read1_seq > minimum_read_number) { call metaspades_task.metaspades_pe { input: - read1_cleaned = krakentools.extracted_read1, - read2_cleaned = krakentools.extracted_read2, + read1_cleaned = select_first([krakentools.extracted_read1]), + read2_cleaned = 
select_first([krakentools.extracted_read2]), samplename = "~{samplename}_~{taxon_id}" } if (defined(metaspades_pe.assembly_fasta)) { call minimap2_task.minimap2 as minimap2_assembly_correction { input: - query1 = krakentools.extracted_read1, - query2 = krakentools.extracted_read2, + query1 = select_first([krakentools.extracted_read1]), + query2 = select_first([krakentools.extracted_read2]), reference = select_first([metaspades_pe.assembly_fasta]), samplename = "~{samplename}_~{taxon_id}", mode = "sr", @@ -109,9 +105,9 @@ workflow theiameta_panel_illumina_pe { input: samplename = "~{samplename}_~{taxon_id}", assembly_fasta = select_first([pilon.assembly_fasta]), - read1 = krakentools.extracted_read1, - read2 = krakentools.extracted_read2, - taxon_id = "~{taxon_id}", + read1 = select_first([krakentools.extracted_read1]), + read2 = select_first([krakentools.extracted_read2]), + taxon_id = taxon_id, seq_method = "ILLUMINA" } } @@ -124,7 +120,7 @@ workflow theiameta_panel_illumina_pe { samplename = samplename, taxon_ids = write_json(taxon_ids), organism = write_json(krakentools.organism_name), - extracted_read1 = write_json(krakentools.extracted_read1), ## not sure how useful these links are + extracted_read1 = write_json(krakentools.extracted_read1), extracted_read2 = write_json(krakentools.extracted_read2), krakentools_docker = write_json(krakentools.krakentools_docker), fastq_scan_num_reads_binned1 = write_json(fastq_scan_binned.read1_seq), @@ -134,7 +130,7 @@ workflow theiameta_panel_illumina_pe { fastq_scan_version = write_json(fastq_scan_binned.version), metaspades_warning = write_json(metaspades_pe.metaspades_warning), pilon_warning = write_json(pilon.pilon_warning), - pilon_assembly_fasta = write_json(pilon.assembly_fasta), # maybe?? 
+ assembly_fasta = write_json(pilon.assembly_fasta), quast_genome_length = write_json(quast.genome_length), quast_number_contigs = write_json(quast.number_contigs), quast_n50 = write_json(quast.n50_value), From 48a26a21b5fd4ac4065e7aa46851129536449422 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 17:04:41 +0000 Subject: [PATCH 45/48] add flu outputs to gather scatter --- .../data_handling/task_gather_scatter.wdl | 34 +++++++++++++++++++ .../wf_theiameta_panel_illumina_pe.wdl | 14 +++++++- workflows/utilities/wf_morgana_magic.wdl | 16 --------- 3 files changed, 47 insertions(+), 17 deletions(-) diff --git a/tasks/utilities/data_handling/task_gather_scatter.wdl b/tasks/utilities/data_handling/task_gather_scatter.wdl index 398f3d3f9..09523e4f3 100644 --- a/tasks/utilities/data_handling/task_gather_scatter.wdl +++ b/tasks/utilities/data_handling/task_gather_scatter.wdl @@ -59,7 +59,23 @@ task gather_scatter { File? nextclade_aa_dels_flu_na File? nextclade_clade_flu_na File? nextclade_qc_flu_na + # IRMA outputs + File? irma_version + File? irma_docker + File? irma_type + File? irma_subtype + File? irma_subtype_notes + # GenoFLU outputs + File? genoflu_version + File? genoflu_genotype + File? genoflu_all_segments + # abricate outputs + File? abricate_flu_type + File? abricate_flu_subtype + File? abricate_flu_database + File? 
abricate_flu_version + # runtime parameters String docker = "us-docker.pkg.dev/general-theiagen/theiagen/terra-tools:2023-03-16" Int disk_size = 50 Int cpu = 2 @@ -83,6 +99,8 @@ task gather_scatter { return None df = pd.DataFrame() + + # organism-agnostic columns df = load_json_data("~{taxon_ids}", "taxon_ids", df) df = load_json_data("~{organism}", "organism", df) df = load_json_data("~{extracted_read1}", "extracted_read1", df) @@ -105,6 +123,8 @@ task gather_scatter { df = load_json_data("~{number_Degenerate}", "number_Degenerate", df) df = load_json_data("~{number_Total}", "number_Total", df) df = load_json_data("~{percent_reference_coverage}", "percent_reference_coverage", df) + + # organism-specific columns df = load_json_data("~{pango_lineage}", "pango_lineage", df) df = load_json_data("~{pango_lineage_expanded}", "pango_lineage_expanded", df) df = load_json_data("~{pangolin_conflicts}", "pangolin_conflicts", df) @@ -130,6 +150,20 @@ task gather_scatter { df = load_json_data("~{nextclade_aa_dels_flu_na}", "nextclade_aa_dels_flu_na", df) df = load_json_data("~{nextclade_clade_flu_na}", "nextclade_clade_flu_na", df) df = load_json_data("~{nextclade_qc_flu_na}", "nextclade_qc_flu_na", df) + df = load_json_data("~{irma_version}", "irma_version", df) + df = load_json_data("~{irma_docker}", "irma_docker", df) + df = load_json_data("~{irma_type}", "irma_type", df) + df = load_json_data("~{irma_subtype}", "irma_subtype", df) + df = load_json_data("~{irma_subtype_notes}", "irma_subtype_notes", df) + df = load_json_data("~{genoflu_version}", "genoflu_version", df) + df = load_json_data("~{genoflu_genotype}", "genoflu_genotype", df) + df = load_json_data("~{genoflu_all_segments}", "genoflu_all_segments", df) + df = load_json_data("~{abricate_flu_type}", "abricate_flu_type", df) + df = load_json_data("~{abricate_flu_subtype}", "abricate_flu_subtype", df) + df = load_json_data("~{abricate_flu_database}", "abricate_flu_database", df) + df = 
load_json_data("~{abricate_flu_version}", "abricate_flu_version", df) + + # add samplename column df.insert(0, "samplename" , "~{samplename}") # print(df) diff --git a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl index 3ba184a02..95a71f46a 100644 --- a/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl +++ b/workflows/theiameta/wf_theiameta_panel_illumina_pe.wdl @@ -164,7 +164,19 @@ workflow theiameta_panel_illumina_pe { nextclade_aa_subs_flu_na = write_json(morgana_magic.nextclade_aa_subs_flu_na), nextclade_aa_dels_flu_na = write_json(morgana_magic.nextclade_aa_dels_flu_na), nextclade_clade_flu_na = write_json(morgana_magic.nextclade_clade_flu_na), - nextclade_qc_flu_na = write_json(morgana_magic.nextclade_qc_flu_na) + nextclade_qc_flu_na = write_json(morgana_magic.nextclade_qc_flu_na), + irma_version = write_json(morgana_magic.irma_version), + irma_docker = write_json(morgana_magic.irma_docker), + irma_type = write_json(morgana_magic.irma_type), + irma_subtype = write_json(morgana_magic.irma_subtype), + irma_subtype_notes = write_json(morgana_magic.irma_subtype_notes), + genoflu_version = write_json(morgana_magic.genoflu_version), + genoflu_genotype = write_json(morgana_magic.genoflu_genotype), + genoflu_all_segments = write_json(morgana_magic.genoflu_all_segments), + abricate_flu_type = write_json(morgana_magic.abricate_flu_type), + abricate_flu_subtype = write_json(morgana_magic.abricate_flu_subtype), + abricate_flu_database = write_json(morgana_magic.abricate_flu_database), + abricate_flu_version = write_json(morgana_magic.abricate_flu_version) } output { # versioning outputs diff --git a/workflows/utilities/wf_morgana_magic.wdl b/workflows/utilities/wf_morgana_magic.wdl index 26bed4bb9..80b6298a5 100644 --- a/workflows/utilities/wf_morgana_magic.wdl +++ b/workflows/utilities/wf_morgana_magic.wdl @@ -148,15 +148,6 @@ workflow morgana_magic { memory = nextclade_output_parser_memory } } - 
##### is running quasitools even something we want to do???? - if (organism_parameters.standardized_organism == "HIV") { - call quasitools.quasitools as quasitools_illumina_pe { - input: - read1 = read1, - read2 = read2, - samplename = samplename - } - } output { String organism = organism_parameters.standardized_organism # Consensus QC outputs @@ -222,12 +213,5 @@ workflow morgana_magic { File? abricate_flu_results = flu_track.abricate_flu_results String? abricate_flu_database = flu_track.abricate_flu_database String? abricate_flu_version = flu_track.abricate_flu_version - # HIV Outputs - String? quasitools_version = quasitools_illumina_pe.quasitools_version - String? quasitools_date = quasitools_illumina_pe.quasitools_date - File? quasitools_coverage_file = quasitools_illumina_pe.coverage_file - File? quasitools_dr_report = quasitools_illumina_pe.dr_report - File? quasitools_hydra_vcf = quasitools_illumina_pe.hydra_vcf - File? quasitools_mutations_report = quasitools_illumina_pe.mutations_report } } \ No newline at end of file From fcc17d9bd9a0b6cf134e335d1817a3836e45ba1b Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 18:25:29 +0000 Subject: [PATCH 46/48] finish documentation --- .../pangolin_update.md | 6 +- .../genomic_characterization/theiacov.md | 3 +- .../genomic_characterization/theiameta.md | 50 ++- .../theiameta_panel.md | 285 +++++++++++++++++- 4 files changed, 330 insertions(+), 14 deletions(-) diff --git a/docs/workflows/genomic_characterization/pangolin_update.md b/docs/workflows/genomic_characterization/pangolin_update.md index 988db4404..a05756888 100644 --- a/docs/workflows/genomic_characterization/pangolin_update.md +++ b/docs/workflows/genomic_characterization/pangolin_update.md @@ -65,4 +65,8 @@ This workflow runs on the sample level. 
| **pangolin_updates** | String | Result of Pangolin Update (lineage changed versus unchanged) with lineage assignment and date of analysis | | **pangolin_versions** | String | All Pangolin software and database versions | - \ No newline at end of file + + +## References + +> **Pangolin**: RRambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. diff --git a/docs/workflows/genomic_characterization/theiacov.md b/docs/workflows/genomic_characterization/theiacov.md index a21d46f89..b78c368ae 100644 --- a/docs/workflows/genomic_characterization/theiacov.md +++ b/docs/workflows/genomic_characterization/theiacov.md @@ -900,6 +900,7 @@ All input reads are processed through "core tasks" in the TheiaCoV Illumina, ONT | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + | Original Publication(s) | [A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology](https://doi.org/10.1038/s41564-020-0770-5) | ??? task "`nextclade`" @@ -1138,7 +1139,7 @@ All TheiaCoV Workflows (not TheiaCoV_FASTA_Batch) | nextclade_json_flu_ha | File | Nextclade output in JSON file format, specific to Flu HA segment | ONT, PE | | nextclade_json_flu_na | File | Nextclade output in JSON file format, specific to Flu NA segment | ONT, PE | | nextclade_lineage | String | Nextclade lineage designation | CL, FASTA, ONT, PE, SE | -| nextclade_qc | String | QC metric as determined by Nextclade. 
(For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | +| nextclade_qc | String | QC metric as determined by Nextclade. Will be blank for Flu | CL, FASTA, ONT, PE, SE | | nextclade_qc_flu_ha | String | QC metric as determined by Nextclade, specific to Flu HA segment | ONT, PE | | nextclade_qc_flu_na | String | QC metric as determined by Nextclade, specific to Flu NA segment | ONT, PE | | nextclade_tsv | File | Nextclade output in TSV file format. (For Flu, this output will be specific to HA segment) | CL, FASTA, ONT, PE, SE | diff --git a/docs/workflows/genomic_characterization/theiameta.md b/docs/workflows/genomic_characterization/theiameta.md index 55c26d9a6..eb501b301 100644 --- a/docs/workflows/genomic_characterization/theiameta.md +++ b/docs/workflows/genomic_characterization/theiameta.md @@ -241,22 +241,62 @@ The TheiaMeta_Illumina_PE workflow processes Illumina paired-end (PE) reads ge #### Assembly ??? task "`metaspades`: _De Novo_ Metagenomic Assembly" + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. - While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. 
+ `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. !!! techdetails "MetaSPAdes Technical Details" - | | Links | | --- | --- | | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | - | Software Documentation | | - | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411777/) | + | Software Documentation | [SPAdes Manual](https://ablab.github.io/spades/index.html) | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | -??? task "`minimap2`: Assembly Alignment and Contig Filtering (if a reference is provided)" +??? task "`minimap2`: Assembly Alignment and Contig Filtering" If a reference genome is provided through the **`reference`** optional input, the assembly produced with `metaspades` will be mapped to the reference genome with `minimap2`. The contigs which align to the reference are retrieved and returned in the **`assembly_fasta`** output. + `minimap2` is a popular aligner that is used for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly or a reference genome. + + In minimap2, "modes" are a group of preset options. Two different modes are used in this task depending on whether a reference genome is provided. + + If a reference genome is _not_ provided, the only mode used in this task is `sr` which is intended for "short single-end reads without splicing". 
The `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. The output file is in SAM format. + + If a reference genome is provided, then after the draft assembly polishing with `pilon`, this task runs again with the mode set to `asm20` which is intended for "long assembly to reference mapping". The `asm20` mode indicates the following parameters should be used: `-k19 -w10 -U50,500 --rmq -r100k -g10k -A1 -B4 -O6,26 -E2,1 -s200 -z200 -N50`. The output file is in PAF format. + + For more information, please see the [minimap2 manpage](https://lh3.github.io/minimap2/minimap2.html) + + !!! techdetails "minimap2 Technical Details" + | | Links | + |---|---| + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | [minimap2 on GitHub](https://github.com/lh3/minimap2) | + | Software Documentation | [minimap2](https://lh3.github.io/minimap2) | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) | + +??? task "`samtools`: SAM File Conversion " + This task converts the output SAM file from minimap2 and converts it to a BAM file. It then sorts the BAM based on the read names, and then generates an index file. + + !!! techdetails "samtools Technical Details" + | | Links | + |---|---| + | Task | [task_samtools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_parse_mapping.wdl) | + | Software Source Code | [samtools on GitHub](https://github.com/samtools/samtools) | + | Software Documentation | [samtools](https://www.htslib.org/doc/samtools.html) | + | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | + +??? task "`pilon`: Assembly Polishing" + `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. + + !!! techdetails "pilon Technical Details" + | | Links | + |---|---| + | Task | [task_pilon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_pilon.wdl) | + | Software Source Code | [Pilon on GitHub](https://github.com/broadinstitute/pilon) | + | Software Documentation | [Pilon Wiki](https://github.com/broadinstitute/pilon/wiki) | + | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | + #### Assembly QC ??? task "`quast`: Assembly Quality Assessment" diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 31821973f..553b52856 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -304,7 +304,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei ### Workflow Tasks ??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" - + ##### Read Cleaning {#read_QC_trim} `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. 
**Read quality trimming** @@ -372,7 +372,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/) | ??? task "`kraken2`: Taxonomic Classification" - + ##### Kraken2 {#kraken2} Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. Kraken2 is run on the clean reads that result from the `read_QC_trim` subworkflow. By default, the Kraken2 database is set to the `k2_viral_20240112` database, located at `"gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz"`. @@ -389,6 +389,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | ??? task "`extract_kraken_reads` from KrakenTools: Read Binning" + ##### KrakenTools {#extract_kraken_reads} KrakenTools is a collection of scripts that can be used to help downstream analysis of Kraken2 results. In particular, this task uses the `extract_kraken_reads` script, which extracts reads classified at any user-specified taxonomy IDs. All parent and children reads of the specified taxonomic ID are also extracted. !!! 
techdetails "KrakenTools Technical Details" @@ -397,7 +398,170 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Task | [task_krakentools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_krakentools.wdl) | Software Source Code | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | | Software Documentation | [KrakenTools on GitHub](https://github.com/jenniferlu717/KrakenTools) | - | Original Publication | [Metagenome analysis using the Kraken software suite](https://doi.org/10.1038/s41596-022-00738-y) | + | Original Publication(s) | [Metagenome analysis using the Kraken software suite](https://doi.org/10.1038/s41596-022-00738-y) | + +??? task "`fastq_scan`: Summarizing Read Bins" + ##### FASTQ Scan {#fastq_scan} + `fastq_scan` is used to summarize the read bins generated by the `extract_kraken_reads` task. It provides basic statistics about the read bins, such as the number of reads in each bin and the number of read pairs. + + !!! techdetails "fastq_scan Technical Details" + | | Links | + | --- | --- | + | Task | [task_fastq_scan.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_fastq_scan.wdl) | + | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan) | + | Software Documentation | [fastq-scan](https://github.com/rpetit3/fastq-scan) | + +??? task "`metaspades`: _De Novo_ Metagenomic Assembly" + ##### metaSPAdes {#metaspades} + While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. 
metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. + + `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. + + !!! techdetails "MetaSPAdes Technical Details" + | | Links | + | --- | --- | + | Task | [task_metaspades.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_metaspades.wdl) | + | Software Source Code | [SPAdes on GitHub](https://github.com/ablab/spades) | + | Software Documentation | [SPAdes Manual](https://ablab.github.io/spades/index.html) | + | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | + +??? task "`minimap2`: Assembly Alignment and Contig Filtering" + + ##### minimap2 {#minimap2} + + `minimap2` is a popular aligner that is used in TheiaMeta_Panel for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly. + + The default mode used in this task is `sr` which is intended for "short single-end reads without splicing". In minimap2, "modes" are a group of preset options; the `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. + + For more information, please see the [minimap2 manpage](https://lh3.github.io/minimap2/minimap2.html) + + !!! 
techdetails "minimap2 Technical Details" + | | Links | + |---|---| + | Task | [task_minimap2.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/alignment/task_minimap2.wdl) | + | Software Source Code | [minimap2 on GitHub](https://github.com/lh3/minimap2) | + | Software Documentation | [minimap2](https://lh3.github.io/minimap2) | + | Original Publication(s) | [Minimap2: pairwise alignment for nucleotide sequences](https://academic.oup.com/bioinformatics/article/34/18/3094/4994778) | + +??? task "`samtools`: SAM File Conversion" + This task converts the output SAM file from minimap2 and converts it to a BAM file. It then sorts the BAM based on the read names, and then generates an index file. + + !!! techdetails "samtools Technical Details" + | | Links | + |---|---| + | Task | [task_samtools.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/utilities/data_handling/task_parse_mapping.wdl) | + | Software Source Code | [samtools on GitHub](https://github.com/samtools/samtools) | + | Software Documentation | [samtools](https://www.htslib.org/doc/samtools.html) | + | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | + +??? task "`pilon`: Assembly Polishing" + + ##### Pilon {#pilon} + + `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. + + !!! techdetails "pilon Technical Details" + | | Links | + |---|---| + | Task | [task_pilon.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/read_filtering/task_pilon.wdl) | + | Software Source Code | [Pilon on GitHub](https://github.com/broadinstitute/pilon) | + | Software Documentation | [Pilon Wiki](https://github.com/broadinstitute/pilon/wiki) | + | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | + +??? task "`quast`: Assembly Quality Assessment" + + ##### QUAST {#quast} + + QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. + + !!! techdetails "QUAST Technical Details" + | | Links | + | --- | --- | + | Task | [task_quast.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/task_quast.wdl) | + | Software Source Code | [QUAST on GitHub](https://github.com/ablab/quast) | + | Software Documentation | | + | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | + +??? 
task "`morgana_magic`: Genomic Characterization" + + ##### Morgana Magic {#morgana_magic} + + Morgana Magic is the viral equivalent of the `merlin_magic` subworkflow used in the TheiaProk workflows. This workflow launches several tasks that characterize the viral genome, including Pangolin4, Nextclade, and others. + + This subworkflow currently only supports the organisms that are natively supported by the [TheiaCoV workflows](./theiacov.md). + + The following tasks only run for the appropriate taxon ID if sufficient reads were extracted. The following table illustrates which characterization tools are run for the indicated organism. + + | | SARS-CoV-2 | MPXV | WNV | Influenza | RSV-A | RSV-B | + | --- | --- | --- | --- | --- | --- | --- | + | Pangolin | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | + | Nextclade | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | + | IRMA | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | Abricate | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + | GenoFLU | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | + + ??? task "`pangolin`" + Pangolin designates SARS-CoV-2 lineage assignments. + + !!! techdetails "Pangolin Technical Details" + + | | Links | + | --- | --- | + | Task | [task_pangolin.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/betacoronavirus/task_pangolin.wdl) | + | Software Source Code | [Pangolin on GitHub](https://github.com/cov-lineages/pangolin) | + | Software Documentation | [Pangolin website](https://cov-lineages.org/resources/pangolin.html) | + | Original Publication(s) | [A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology](https://doi.org/10.1038/s41564-020-0770-5) | + + ??? task "`nextclade`" + ["Nextclade is an open-source project for viral genome alignment, mutation calling, clade assignment, quality checks and phylogenetic placement."](https://docs.nextstrain.org/projects/nextclade/en/stable/) + + !!! 
techdetails "Nextclade Technical Details" + + | | Links | + | --- | --- | + | Task | [task_nextclade.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/taxon_id/task_nextclade.wdl#L63) | + | Software Source Code | | + | Software Documentation | [Nextclade](https://docs.nextstrain.org/projects/nextclade/en/stable/) | + | Original Publication(s) | [Nextclade: clade assignment, mutation calling and quality control for viral genomes.](https://doi.org/10.21105/joss.03773) | + + ??? task "`irma`" + Cleaned reads are re-assembled using `irma` which does not use a reference due to the rapid evolution and high variability of influenza. Assemblies produced by `irma` will be ordered from largest to smallest assembled flu segment. `irma` also performs typing and subtyping as part of the assembly process. + + General statistics about the assembly are generated with the `consensus_qc` task ([task_assembly_metrics.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/quality_control/basic_statistics/task_assembly_metrics.wdl)). + + !!! techdetails "IRMA Technical Details" + | | Links | + | --- | --- | + | Task | [task_irma.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/assembly/task_irma.wdl) | + | Software Documentation | [IRMA website](https://wonder.cdc.gov/amd/flu/irma/) | + | Original Publication(s) | [Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler](https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-016-3030-6) | + + ??? task "`abricate`" + Abricate assigns types and subtypes/lineages for flu samples. + + !!! 
techdetails "Abricate Technical Details" + | | Links | + | --- | --- | + | Task | [task_abricate.wdl (abricate_flu subtask)](https://github.com/theiagen/public_health_bioinformatics/blob/2dff853defc6ea540a058873f6fe6a78cc2350c7/tasks/gene_typing/drug_resistance/task_abricate.wdl#L59) | + | Software Source Code | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + | Software Documentation | [ABRicate on GitHub](https://github.com/tseemann/abricate) | + + ??? task "`genoflu`" + This sub-workflow determines the whole-genome genotype of an H5N1 flu sample. + + !!! techdetails "GenoFLU Technical Details" + | | Links | + | --- | --- | + | Task | [task_genoflu.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/species_typing/orthomyxoviridae/task_genoflu.wdl) | + | Software Source Code | [GenoFLU on GitHub](https://github.com/USDA-VS/GenoFLU) | + +??? task "`gather_scatter`: Generate Summary File" + The `gather_scatter` task generates a summary file with all the results for all taxon IDs with identified reads. Please see the [`results_by_taxon_tsv`](#results_by_taxon_tsv) section below for more information. + + !!! techdetails "gather_scatter Technical Details" + | | Links | + | --- | --- | + | Task | [task_gather_scatter.wdl](https://github.com/theiagen/public_health_bioinformatics/blob/main/tasks/data_handling/task_gather_scatter.wdl) | ### Outputs @@ -411,15 +575,122 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | kraken2_docker | String | Docker image used to run kraken2 | | kraken2_report | File | Text document describing taxonomic prediction of every FASTQ record. 
This file can be very large and cumbersome to open and view | | kraken2_version | String | The version of Kraken2 used in the analysis | -| results_by_taxon_tsv | File | A TSV file that contains the results for every taxon ID provided in the taxon_ids input variable that had reads identified; characterization (if applicable) and basic statistics regarding read count, assembly generation (if applicable), and general quality, are also associated with each bin | +| results_by_taxon_tsv | File | A TSV file that contains the results for every taxon ID provided in the taxon_ids input variable that had reads identified; characterization (if applicable) and basic statistics regarding read count, assembly generation (if applicable), and general quality, are also associated with each bin; see below for more details. | | theiameta_panel_illumina_pe_analysis_date | String | Date the workflow was run | | theiameta_panel_illumina_pe_version | String | Version of PHB used to run the workflow | -#### The `results_by_taxon_tsv` Output File - -This file contains the +#### The `results_by_taxon_tsv` Output File {#results_by_taxon_tsv} + +This TSV file contains a summary of all of the taxon IDs provided in the `taxon_ids` input variable that had reads identified, with each row representing a taxon ID. + +Depending on whether reads could be extracted for the taxon ID, the `organism` column will contain the name of the organism. This column will be blank if no reads were able to be extracted for the taxon ID in the sample. + +??? toggle "What columns are included?" 
+ The following columns are included in the `results_by_taxon_tsv` file: + + - `taxon_id`: The taxon ID used for the binning, generated for all taxon IDs provided in the `taxon_ids` input variable + - `organism`: The name of the organism associated with the taxon ID if reads were able to be extracted; the following columns are blank if no reads were able to be extracted for the taxon ID in the sample + - `extracted_read1`: The GSURI of the extracted read1 FASTQ file + - `extracted_read2`: The GSURI of the extracted read2 FASTQ file + - `krakentools_docker`: The Docker image used to run KrakenTools' `extract_kraken_reads` + - `fastq_scan_num_reads_binned1`: The number of reads in the extracted read1 FASTQ file + - `fastq_scan_num_reads_binned2`: The number of reads in the extracted read2 FASTQ file + - `fastq_scan_num_reads_binned_pairs`: The number of read pairs in the extracted read1 and read2 FASTQ files + - `fastq_scan_docker`: The Docker image used to run the `fastq_scan` task + - `fastq_scan_version`: The version of the `fastq_scan` tool used in the analysis + - `metaspades_warning`: A warning message if an empty assembly was produced for the taxon ID; blank if assembly was successful + - `pilon_warning`: A warning message if Pilon failed, blank if assembly polishing was successful + - `assembly_fasta`: A GSURI to the assembly FASTA file + - `quast_genome_length`: The length of the assembly + - `quast_number_contigs`: The number of contigs in the assembly + - `quast_n50`: The N50 value of the assembly + - `quast_gc_percent`: The GC content of the assembly + - `number_N`: The number of Ns in the assembly + - `number_ATCG`: The number of ATCGs in the assembly + - `number_Degenerate`: The number of degenerate bases in the assembly + - `number_Total`: The total number of bases in the assembly + - `percent_reference_coverage`: The percent of the reference genome covered by the assembly; only applicable if the taxon ID is already supported by TheiaCoV (additional 
assembly files may be added in the future) + + Any subsequent columns are specific to the identified organism and taxon ID; typically, values for these columns are only produced if the organism is natively supported by the TheiaCoV workflows. + + ??? toggle "SARS-CoV-2: _Pangolin_" + - `pango_lineage`: The Pango lineage of the assembly + - `pango_lineage_expanded`: The Pango lineage of the assembly without aliases + - `pangolin_conflicts`: The number of conflicts in the Pango lineage + - `pangolin_notes`: Any notes generated by Pangolin about the lineage + - `pangolin_assignment_version`: The version of the assignment module used to assign the Pango lineage + - `pangolin_version`: The version of Pangolin used to generate the Pango lineage + - `pangolin_docker`: The Docker image used to run Pangolin + + ??? toggle "Mpox, SARS-CoV-2, RSV-A, RSV-B: _Nextclade_" + - `nextclade_version`: The version of Nextclade used + - `nextclade_docker`: The Docker image used to run Nextclade + - `nextclade_ds_tag`: The dataset tag used to run Nextclade + - `nextclade_aa_subs`: Amino-acid substitutions as detected by Nextclade + - `nextclade_aa_dels`: Amino-acid deletions as detected by Nextclade + - `nextclade_clade`: Nextclade clade designation + - `nextclade_lineage`: Nextclade lineage designation + - `nextclade_qc`: QC metric as determined by Nextclade + + ??? 
toggle "Flu: _Nextclade_, _IRMA_, _GenoFLU_, _ABRicate_" + - `nextclade_version`: The version of Nextclade used + - `nextclade_docker`: The Docker image used to run Nextclade + - `nextclade_ds_tag_flu_ha`: The dataset tag used to run Nextclade for the HA segment + - `nextclade_aa_subs_flu_ha`: Amino-acid substitutions as detected by Nextclade for the HA segment + - `nextclade_aa_dels_flu_ha`: Amino-acid deletions as detected by Nextclade for the HA segment + - `nextclade_clade_flu_ha`: Nextclade clade designation for the HA segment + - `nextclade_lineage_flu_ha`: Nextclade lineage designation for the HA segment + - `nextclade_qc_flu_ha`: QC metric as determined by Nextclade for the HA segment + - `nextclade_ds_tag_flu_na`: The dataset tag used to run Nextclade for the NA segment + - `nextclade_aa_subs_na`: Amino-acid substitutions as detected by Nextclade for the NA segment + - `nextclade_aa_dels_na`: Amino-acid deletions as detected by Nextclade for the NA segment + - `nextclade_clade_flu_na`: Nextclade clade designation for the NA segment + - `nextclade_lineage_flu_na`: Nextclade lineage designation for the NA segment + - `nextclade_qc_flu_na`: QC metric as determined by Nextclade for the NA segment + - `irma_version`: The version of IRMA used + - `irma_docker`: The Docker image used to run IRMA + - `irma_type`: The flu type identified by IRMA + - `irma_subtype`: The flu subtype identified by IRMA + - `irma_subtype_notes`: Any notes generated by IRMA about the subtype + - `genoflu_version`: The version of GenoFLU used + - `genoflu_genotype`: The complete genotype of the flu sample + - `genoflu_all_segments`: The genotype of each flu segment in the sample + - `abricate_flu_type`: The flu type identified by ABRicate + - `abricate_flu_subtype`: The flu subtype identified by ABRicate + - `abricate_flu_database`: The flu database used by ABRicate + - `abricate_flu_version`: The version of ABRicate used + +This file can be downloaded and opened in Excel to view the 
full result summary for the sample. Due to the nature of the TheiaMeta_Panel workflow and Terra, displaying this information in the Terra table would be challenging to view, which is why we have generated this file. If you have any suggestions on formatting or additional outputs, please let us know at or by submitting an issue. ## References +> **Trimmomatic**: Bolger AM, Lohse M, Usadel B. Trimmomatic: a flexible trimmer for Illumina sequence data. Bioinformatics. 2014 Aug 1;30(15):2114-20. doi: 10.1093/bioinformatics/btu170. Epub 2014 Apr 1. PMID: 24695404; PMCID: PMC4103590. + +> **fastp**: Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018 Sep 1;34(17):i884-i890. doi: 10.1093/bioinformatics/bty560. PMID: 30423086; PMCID: PMC6129281. + +> **MIDAS**: Nayfach S, Rodriguez-Mueller B, Garud N, Pollard KS. An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography. Genome Res. 2016 Nov;26(11):1612-1625. doi: 10.1101/gr.201863.115. Epub 2016 Oct 18. PMID: 27803195; PMCID: PMC5088602. + +> **Kraken2**: Wood DE, Lu J, Langmead B. Improved metagenomic analysis with Kraken 2. Genome Biol. 2019 Nov 28;20(1):257. doi: 10.1186/s13059-019-1891-0. PMID: 31779668; PMCID: PMC6883579. + +> **KrakenTools**: Lu J, Rincon N, Wood DE, Breitwieser FP, Pockrandt C, Langmead B, Salzberg SL, Steinegger M. Metagenome analysis using the Kraken software suite. Nat Protoc. 2022 Dec;17(12):2815-2839. doi: 10.1038/s41596-022-00738-y. Epub 2022 Sep 28. Erratum in: Nat Protoc. 2024 Aug 29. doi: 10.1038/s41596-024-01064-1. PMID: 36171387; PMCID: PMC9725748. + +> **metaSPAdes**: Nurk S, Meleshko D, Korobeynikov A, Pevzner PA. metaSPAdes: a new versatile metagenomic assembler. Genome Res. 2017 May;27(5):824-834. doi: 10.1101/gr.213959.116. Epub 2017 Mar 15. PMID: 28298430; PMCID: PMC5411777. + +> **minimap2**: Li H. Minimap2: pairwise alignment for nucleotide sequences. 
Bioinformatics. 2018 Sep 15;34(18):3094-3100. doi: 10.1093/bioinformatics/bty191. PMID: 29750242; PMCID: PMC6137996. + +> **SAMtools**: Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PMID: 19505943; PMCID: PMC2723002. + +> **SAMtools**: Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590861; PMCID: PMC7931819. + +> **Pilon**: Walker BJ, Abeel T, Shea T, Priest M, Abouelliel A, Sakthikumar S, Cuomo CA, Zeng Q, Wortman J, Young SK, Earl AM. Pilon: an integrated tool for comprehensive microbial variant detection and genome assembly improvement. PLoS One. 2014 Nov 19;9(11):e112963. doi: 10.1371/journal.pone.0112963. PMID: 25409509; PMCID: PMC4237348. + +> **QUAST**: Gurevich A, Saveliev V, Vyahhi N, Tesler G. QUAST: quality assessment tool for genome assemblies. Bioinformatics. 2013 Apr 15;29(8):1072-5. doi: 10.1093/bioinformatics/btt086. Epub 2013 Feb 19. PMID: 23422339; PMCID: PMC3624806. + +> **Pangolin**: Rambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. + +> **Nextclade**: Aksamentov et al., (2021). Nextclade: clade assignment, mutation calling and quality control for viral genomes. Journal of Open Source Software, 6(67), 3773, https://doi.org/10.21105/joss.03773 + +> **IRMA**: Shepard SS, Meno S, Bahl J, Wilson MM, Barnes J, Neuhaus E. 
Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. BMC Genomics. 2016 Sep 5;17(1):708. doi: 10.1186/s12864-016-3030-6. Erratum in: BMC Genomics. 2016 Oct 13;17(1):801. doi: 10.1186/s12864-016-3138-8. PMID: 27595578; PMCID: PMC5011931. + From 8bc64fb2cdeaea7827213f9220d543c9497d6140 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 18:28:55 +0000 Subject: [PATCH 47/48] clean up docs --- .../theiameta_panel.md | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/docs/workflows/genomic_characterization/theiameta_panel.md b/docs/workflows/genomic_characterization/theiameta_panel.md index 553b52856..56458276d 100644 --- a/docs/workflows/genomic_characterization/theiameta_panel.md +++ b/docs/workflows/genomic_characterization/theiameta_panel.md @@ -284,7 +284,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | read_QC_trim | **phix** | File | A file containing the phix used during Illumina sequencing; used in the BBDuk task | | Optional | | read_QC_trim | **read_processing** | String | Read trimming software to use, either "trimmomatic" or "fastp" | trimmomatic | Optional | | read_QC_trim | **read_qc** | String | Allows the user to decide between fastq_scan (default) and fastqc for the evaluation of read quality. | fastq_scan | Optional | -| read_QC_trim | **trim_min_length** | Int | The minimum length of each read after trimming | 75 | Optional | +| read_QC_trim | **trim_min_length** | Int | The minimum length of each read after trimming | 75 | Optional | | read_QC_trim | **trim_primers** | Boolean | A True/False option that determines if primers should be trimmed. 
| TRUE | Optional | | read_QC_trim | **trim_quality_min_score** | Int | The minimum quality score to keep during trimming | 30 | Optional | | read_QC_trim | **trim_window_size** | Int | Specifies window size for trimming (the number of bases to average the quality across) | 4 | Optional | @@ -303,8 +303,9 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei ### Workflow Tasks +#### Read QC and Cleaning + ??? task "`read_QC_trim`: Read Quality Trimming, Adapter Removal, Quantification, and Identification" - ##### Read Cleaning {#read_QC_trim} `read_QC_trim` is a sub-workflow within TheiaMeta that removes low-quality reads, low-quality regions of reads, and sequencing adapters to improve data quality. It uses a number of tasks, described below. **Read quality trimming** @@ -371,8 +372,9 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Software Documentation | [fastp](https://github.com/OpenGene/fastp); [Trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic); [BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/); [fastq-scan](https://github.com/rpetit3/fastq-scan); [MIDAS](https://github.com/snayfach/MIDAS) | | Original Publication(s) | [Trimmomatic: a flexible trimmer for Illumina sequence data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4103590/)
[fastp: an ultra-fast all-in-one FASTQ preprocessor](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234?login=false)
[An integrated metagenomics pipeline for strain profiling reveals novel patterns of bacterial transmission and biogeography](https://pubmed.ncbi.nlm.nih.gov/27803195/) | +#### Taxonomic Classification and Read Binning + ??? task "`kraken2`: Taxonomic Classification" - ##### Kraken2 {#kraken2} Kraken2 is a bioinformatics tool originally designed for metagenomic applications. It has additionally proven valuable for validating taxonomic assignments and checking contamination of single-species (e.g. bacterial isolate, eukaryotic isolate, viral isolate, etc.) whole genome sequence data. Kraken2 is run on the clean reads that result from the `read_QC_trim` subworkflow. By default, the Kraken2 database is set to the `k2_viral_20240112` database, located at `"gs://theiagen-large-public-files-rp/terra/databases/kraken2/k2_viral_20240112.tar.gz"`. @@ -389,7 +391,6 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [Improved metagenomic analysis with Kraken 2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1891-0) | ??? task "`extract_kraken_reads` from KrakenTools: Read Binning" - ##### KrakenTools {#extract_kraken_reads} KrakenTools is a collection of scripts that can be used to help downstream analysis of Kraken2 results. In particular, this task uses the `extract_kraken_reads` script, which extracts reads classified at any user-specified taxonomy IDs. All parent and children reads of the specified taxonomic ID are also extracted. !!! techdetails "KrakenTools Technical Details" @@ -401,7 +402,6 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [Metagenome analysis using the Kraken software suite](https://doi.org/10.1038/s41596-022-00738-y) | ??? task "`fastq_scan`: Summarizing Read Bins" - ##### FASTQ Scan {#fastq_scan} `fastq_scan` is used to summarize the read bins generated by the `extract_kraken_reads` task. 
It provides basic statistics about the read bins, such as the number of reads in each bin and the number of read pairs. !!! techdetails "fastq_scan Technical Details" @@ -411,8 +411,9 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Software Source Code | [fastq-scan](https://github.com/rpetit3/fastq-scan) | | Software Documentation | [fastq-scan](https://github.com/rpetit3/fastq-scan) | +#### Assembly and Polishing + ??? task "`metaspades`: _De Novo_ Metagenomic Assembly" - ##### metaSPAdes {#metaspades} While metagenomics has emerged as a technology of choice for analyzing bacterial populations, the assembly of metagenomic data remains challenging. A dedicated metagenomic assembly algorithm is necessary to circumvent the challenge of interpreting variation. metaSPAdes addresses various challenges of metagenomic assembly by capitalizing on computational ideas that proved to be useful in assemblies of single cells and highly polymorphic diploid genomes. `metaspades` is a _de novo_ assembler that first constructs a de Bruijn graph of all the reads using the SPAdes algorithm. Through various graph simplification procedures, paths in the assembly graph are reconstructed that correspond to long genomic fragments within the metagenome. For more details, please see the original publication. @@ -426,9 +427,6 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [metaSPAdes: a new versatile metagenomic assembler](http://www.genome.org/cgi/doi/10.1101/gr.213959.116) | ??? task "`minimap2`: Assembly Alignment and Contig Filtering" - - ##### minimap2 {#minimap2} - `minimap2` is a popular aligner that is used in TheiaMeta_Panel for correcting the assembly produced by metaSPAdes. This is done by aligning the reads back to the generated assembly. The default mode used in this task is `sr` which is intended for "short single-end reads without splicing". 
In minimap2, "modes" are a group of preset options; the `sr` mode indicates the following parameters should be used: `-k21 -w11 --sr --frag=yes -A2 -B8 -O12,32 -E2,1 -b0 -r100 -p.5 -N20 -f1000,5000 -n2 -m20 -s40 -g100 -2K50m --heap-sort=yes --secondary=no`. @@ -455,9 +453,6 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [The Sequence Alignment/Map format and SAMtools](https://doi.org/10.1093/bioinformatics/btp352)
[Twelve Years of SAMtools and BCFtools](https://doi.org/10.1093/gigascience/giab008) | ??? task "`pilon`: Assembly Polishing" - - ##### Pilon {#pilon} - `pilon` is a tool that uses read alignment to correct errors in an assembly. It is used to polish the assembly produced by metaSPAdes. The input to Pilon is the sorted BAM file produced by `samtools`, and the original draft assembly produced by `metaspades`. !!! techdetails "pilon Technical Details" @@ -469,10 +464,7 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Original Publication(s) | [Pilon: An Integrated Tool for Comprehensive Microbial Variant Detection and Genome Assembly Improvement](https://doi.org/10.1371/journal.pone.0112963) | ??? task "`quast`: Assembly Quality Assessment" - - ##### QUAST {#quast} - - QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. + QUAST stands for QUality ASsessment Tool. It evaluates genome/metagenome assemblies by computing various metrics without a reference being necessary. It includes useful metrics such as number of contigs, length of the largest contig and N50. !!! techdetails "QUAST Technical Details" | | Links | @@ -482,10 +474,9 @@ TheiaMeta_Panel_Illumina_PE was created initially for the [Illumina Viral Survei | Software Documentation | | | Original Publication(s) | [QUAST: quality assessment tool for genome assemblies](https://academic.oup.com/bioinformatics/article/29/8/1072/228832) | -??? task "`morgana_magic`: Genomic Characterization" - - ##### Morgana Magic {#morgana_magic} +#### Morgana Magic +??? task "`morgana_magic`: Genomic Characterization" Morgana Magic is the viral equivalent of the `merlin_magic` subworkflow used in the TheiaProk workflows. 
This workflow launches several tasks that characterize the viral genome, including Pangolin4, Nextclade, and others. This subworkflow currently only supports the organisms that are natively supported by the [TheiaCoV workflows](./theiacov.md). @@ -690,7 +681,7 @@ This file can be downloaded and opened in Excel to view the full result summary > **Pangolin**: Rambaut A, Holmes EC, O'Toole Á, Hill V, McCrone JT, Ruis C, du Plessis L, Pybus OG. A dynamic nomenclature proposal for SARS-CoV-2 lineages to assist genomic epidemiology. Nat Microbiol. 2020 Nov;5(11):1403-1407. doi: 10.1038/s41564-020-0770-5. Epub 2020 Jul 15. PMID: 32669681; PMCID: PMC7610519. -> **Nextclade**: Aksamentov et al., (2021). Nextclade: clade assignment, mutation calling and quality control for viral genomes. Journal of Open Source Software, 6(67), 3773, https://doi.org/10.21105/joss.03773 +> **Nextclade**: Aksamentov et al., (2021). Nextclade: clade assignment, mutation calling and quality control for viral genomes. Journal of Open Source Software, 6(67), 3773. > **IRMA**: Shepard SS, Meno S, Bahl J, Wilson MM, Barnes J, Neuhaus E. Viral deep sequencing needs an adaptive approach: IRMA, the iterative refinement meta-assembler. BMC Genomics. 2016 Sep 5;17(1):708. doi: 10.1186/s12864-016-3030-6. Erratum in: BMC Genomics. 2016 Oct 13;17(1):801. doi: 10.1186/s12864-016-3138-8. PMID: 27595578; PMCID: PMC5011931. 
From 931e81546a36d2874bb3ca65c47f6232c3e08070 Mon Sep 17 00:00:00 2001 From: Sage Wright Date: Mon, 28 Oct 2024 18:35:55 +0000 Subject: [PATCH 48/48] update md5sums --- tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml | 2 +- tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml index 91ae801b7..ffc05aa78 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_pe.yml @@ -625,7 +625,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 2d18c06542f5333e42bd21069d20deef - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl diff --git a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml index 82f9a9a74..fb26972c8 100644 --- a/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml +++ b/tests/workflows/theiaprok/test_wf_theiaprok_illumina_se.yml @@ -588,7 +588,7 @@ - path: miniwdl_run/wdl/tasks/taxon_id/task_gambit.wdl md5sum: 2aa70eab24868920f6c28843dd3b5613 - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_kraken2.wdl - md5sum: 0ea83681884800bda1e3c4e116f2b19d + md5sum: 2d18c06542f5333e42bd21069d20deef - path: miniwdl_run/wdl/tasks/taxon_id/contamination/task_midas.wdl md5sum: 64caaaff5910ac0036e2659434500962 - path: miniwdl_run/wdl/tasks/utilities/data_export/task_broad_terra_tools.wdl