diff --git a/.dockstore.yml b/.dockstore.yml index 004bf9955..91f8eabab 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -220,3 +220,8 @@ workflows: primaryDescriptorPath: /workflows/utilities/wf_theiavalidate.wdl testParameterFiles: - empty.json + - name: Usher_PHB + subclass: WDL + primaryDescriptorPath: /workflows/phylogenetics/wf_usher.wdl + testParameterFiles: + - empty.json \ No newline at end of file diff --git a/tasks/phylogenetic_inference/task_usher.wdl b/tasks/phylogenetic_inference/task_usher.wdl new file mode 100644 index 000000000..4de9ee9d7 --- /dev/null +++ b/tasks/phylogenetic_inference/task_usher.wdl @@ -0,0 +1,140 @@ +version 1.0 + +task usher { + input { + Array[File] assembly_fasta + String tree_name + + # the protobuf tree used to place sequences onto -- required if not mpox or sc2 + File? mutation_annotated_tree_pb + + # the reference genome to align to -- required if not mpox or sc2 + File? reference_genome + Int subtree_size = 20 # change value to indicate how many of the closest-related samples you want to show in a subtree + + # what organism to run usher on -- the organisms below have default data provided + String organism # options: sars-cov-2, mpox, RSV-A, RSV-B + + # runtime + String docker = "us-docker.pkg.dev/general-theiagen/pathogengenomics/usher:0.6.2" + Int memory = 8 + Int cpus = 2 + Int disk_size = 100 + } + command <<< + if [ "~{organism}" == "mpox" ]; then + echo "DEBUG: organism is mpox, downloading latest UShER data" + wget "https://hgdownload.gi.ucsc.edu/hubs/GCF/014/621/545/GCF_014621545.1/UShER_hMPXV/mpxv.latest.masked.pb.gz" + # unzip protobuf file into renamed file + gunzip -c mpxv.latest.masked.pb.gz > mutation.annotated.tree.pb + + # download versioning information of protobuf + wget "https://hgdownload.gi.ucsc.edu/hubs/GCF/014/621/545/GCF_014621545.1/UShER_hMPXV/mpxv.latest.version.txt" + cat mpxv.latest.version.txt > PROTOBUF_VERSION + + # download mpox reference used to make usher protobuf + wget "https://hgdownload.gi.ucsc.edu/hubs/GCF/014/621/545/GCF_014621545.1/GCF_014621545.1.fa.gz" + # unzip reference into renamed file + gunzip -c GCF_014621545.1.fa.gz > reference_genome.fasta + echo "DEBUG: finished downloading mpox data" + elif [ "~{organism}" == "sars-cov-2" ]; then + echo "DEBUG: organism is sars-cov-2, downloading latest UShER data" + wget "https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/public-latest.all.masked.pb.gz" + # unzip protobuf file into renamed file + gunzip -c public-latest.all.masked.pb.gz > mutation.annotated.tree.pb + + # download versioning information of protobuf + wget "https://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/public-latest.version.txt" + cat public-latest.version.txt > PROTOBUF_VERSION + + # copy the usher SC2 reference to expected name + cp /HOME/usher/test/NC_045512v2.fa reference_genome.fasta + echo "DEBUG: finished downloading sars-cov-2 data" + elif [ "~{organism}" == "RSV-A" ]; then + echo "DEBUG: organism is RSV-A, downloading latest UShER data" + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/002/815/475/GCF_002815475.1/UShER_RSV-A/rsvA.latest.pb.gz" + # unzip protobuf file into renamed file + gunzip -c rsvA.latest.pb.gz > mutation.annotated.tree.pb + + # download versioning information of protobuf + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/002/815/475/GCF_002815475.1/UShER_RSV-A/rsvA.latest.version.txt" + cat rsvA.latest.version.txt > PROTOBUF_VERSION + + # download RSV-A reference used to make usher protobuf + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/002/815/475/GCF_002815475.1/GCF_002815475.1.fa.gz" + # unzip reference into renamed file + gunzip -c GCF_002815475.1.fa.gz > reference_genome.fasta + echo "DEBUG: finished downloading RSV-A data" + elif [ "~{organism}" == "RSV-B" ]; then + echo "DEBUG: organism is RSV-B, downloading latest UShER data" + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/000/855/545/GCF_000855545.1/UShER_RSV-B/rsvB.latest.pb.gz" + # unzip protobuf file into renamed file + gunzip -c rsvB.latest.pb.gz > mutation.annotated.tree.pb + + # download versioning information of protobuf + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/000/855/545/GCF_000855545.1/UShER_RSV-B/rsvB.latest.version.txt" + cat rsvB.latest.version.txt > PROTOBUF_VERSION + + # download RSV-B reference used to make usher protobuf + wget "https://hgdownload.soe.ucsc.edu/hubs/GCF/000/855/545/GCF_000855545.1/GCF_000855545.1.fa.gz" + # unzip reference into renamed file + gunzip -c GCF_000855545.1.fa.gz > reference_genome.fasta + echo "DEBUG: finished downloading RSV-B data" + else + echo "DEBUG: organism is unknown, assuming user-provided data" + echo "User-provided protobuf file" > PROTOBUF_VERSION + # rename provided protobuf and reference + mv "~{mutation_annotated_tree_pb}" mutation.annotated.tree.pb + mv "~{reference_genome}" reference_genome.fasta + echo "DEBUG: finished renaming user-provided data" + fi + + echo "DEBUG: concatenating assembly files" + for file in ~{sep=' ' assembly_fasta}; do + cat "${file}" >> all_sequences.fasta + done + + echo "DEBUG: generating MSA" + mafft --thread 10 --auto --keeplength --addfragments all_sequences.fasta reference_genome.fasta > all_sequences.aligned.fasta + + echo "DEBUG: converting MSA to VCF" + faToVcf all_sequences.aligned.fasta all_sequences.merged.vcf + + # capture usher version + usher --version > VERSION + + echo "DEBUG: running UShER" + usher \ + --load-mutation-annotated-tree mutation.annotated.tree.pb \ + --vcf all_sequences.merged.vcf \ + --write-uncondensed-final-tree \ + --write-subtrees-size ~{subtree_size} + + echo "DEUBG: UShER finished, renaming files" + mv uncondensed-final-tree.nh ~{tree_name}_uncondensed-final-tree.nwk + mv clades.txt ~{tree_name}_clades.txt + + # rename subtree files to .nwk extension + for file in *nh; do + mv "$file" "$(basename -- "$file" .nh).nwk" + done + + >>> + output { + File usher_uncondensed_tree = "~{tree_name}_uncondensed-final-tree.nwk" + File usher_clades = "~{tree_name}_clades.txt" + Array[File] usher_subtrees = glob("subtree-*.nwk") + Array[File] usher_subtree_mutations = glob("subtree-*-mutations.txt") + String usher_version = read_string("VERSION") + String usher_protobuf_version = read_string("PROTOBUF_VERSION") + } + runtime { + docker: docker + memory: memory + " GB" + cpu : cpus + disks: "local-disk " + disk_size + " HDD" + disk: disk_size + " GB" + preemptible: 0 + dx_instance_type: "mem3_ssd1_v2_x4" + } +} \ No newline at end of file diff --git a/workflows/phylogenetics/wf_usher.wdl b/workflows/phylogenetics/wf_usher.wdl new file mode 100644 index 000000000..fa10afb2f --- /dev/null +++ b/workflows/phylogenetics/wf_usher.wdl @@ -0,0 +1,33 @@ +version 1.0 + +import "../../tasks/phylogenetic_inference/task_usher.wdl" as usher_task +import "../../tasks/task_versioning.wdl" as versioning + +workflow usher_workflow { + meta { + description: "Place samples onto global tree" + } + input { + Array[File] assembly_fasta + String tree_name + String organism # currently available: "sars-cov-2", "mpox", "RSV-A", "RSV-B" + } + call usher_task.usher { + input: + assembly_fasta = assembly_fasta, + tree_name = tree_name, + organism = organism + } + call versioning.version_capture { + } + output { + File usher_uncondensed_tree = usher.usher_uncondensed_tree + File usher_clades = usher.usher_clades + Array[File] usher_subtrees = usher.usher_subtrees + Array[File] usher_subtree_mutations = usher.usher_subtree_mutations + String usher_version = usher.usher_version + String usher_protobuf_version = usher.usher_protobuf_version + String usher_phb_version = version_capture.phb_version + String usher_phb_analysis_date = version_capture.date + } +} \ No newline at end of file