-
Notifications
You must be signed in to change notification settings - Fork 591
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding scripts to liftover gnomAD. Also bugfixes for Funcotator NIO. (#…
…5514) * Added liftover chain file creation script. * Added WDLs and some arguments to lift over gnomAD * Added chain file for b37->hg38 and arguments for liftover. * Limited to 1000 records in memory. * Added stack trace option to all wdls and sub tasks. * Fixed output to be consistent with local files for indexing. * Added timing information on wdls. * Added a wdl/json to create a TSV from gnomAD allele freq data. * Updated indexFeatureFile wdl, added params for run to index gnomAD. * Added json file for indexing a large gnomad file. * Fixed critical issues with NIO data sources. * Updates to the test script to save output and point to full cloud data. * Added a logger to SeekableByteChannelPrefetcher (no messages logged).
- Loading branch information
1 parent
b9b1d34
commit 02682d5
Showing
19 changed files
with
58,052 additions
and
60 deletions.
There are no files selected for viewing
Empty file.
567 changes: 567 additions & 0 deletions
567
scripts/funcotator/data_sources/createLiftoverForB37ToHg38.sh
Large diffs are not rendered by default.
Oops, something went wrong.
56,506 changes: 56,506 additions & 0 deletions
56,506
scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain
Large diffs are not rendered by default.
Oops, something went wrong.
10 changes: 10 additions & 0 deletions
10
scripts/funcotator/data_sources/gnomAD/createGnomadAlleleFreqTsv.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
{ | ||
"CreateGnomadAlleleFreqTsv.gatk_docker": "broadinstitute/gatk:4.0.11.0", | ||
|
||
"CreateGnomadAlleleFreqTsv.gnomAD_file": "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz", | ||
"CreateGnomadAlleleFreqTsv.out_file_name": "gnomad.genomes.r2.1.sites.alleleFreqs.tsv", | ||
|
||
"CreateGnomadAlleleFreqTsv.mem_gb": "128", | ||
"CreateGnomadAlleleFreqTsv.disk_space_gb": "16384", | ||
"CreateGnomadAlleleFreqTsv.boot_disk_size_gb": "100" | ||
} |
115 changes: 115 additions & 0 deletions
115
scripts/funcotator/data_sources/gnomAD/createGnomadAlleleFreqTsv.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# Create a TSV containing genomic position, dbSNP ID, alleles, and the allele frequency from v2.1 of gnomAD (hg19/b37). | ||
# | ||
# NOTE: This will by default download all of gnomAD to disk. This is a big file, so be careful! | ||
# | ||
# Description of inputs: | ||
# | ||
# Required: | ||
# String gatk_docker - GATK Docker image in which to run | ||
# File gnomAD_file - gnomAD VCF file to process | ||
# String out_file_name - Output file name. | ||
# | ||
# Optional: | ||
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker. | ||
# Int mem_gb - Amount of memory (in GB) to give to the machine running each task in this workflow. | ||
# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted. | ||
# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow. | ||
# Int cpu - Number of CPU cores to give to each machine running each task in this workflow. | ||
# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow. | ||
# | ||
# This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk_jar_override`` for the jar location. As of cromwell-0.24, | ||
# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file | ||
# independent of what is in the docker file. See the README.md for more info. | ||
# | ||
# Workflow entry point: forwards every input to CreateGnomadAlleleFreqTsvTask and
# re-exports the single file that task produces.
workflow CreateGnomadAlleleFreqTsv {

    # Docker image in which the task runs.
    String gatk_docker

    # gnomAD sites VCF to process (defaults to the v2.1 genomes file in the gnomad-public bucket)
    # and the name of the file to write.
    File gnomAD_file = "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz"
    String out_file_name

    # Optional machine settings; each is handed to the task as-is.
    Int? cpu
    Int? mem_gb
    Int? disk_space_gb
    Int? boot_disk_size_gb
    Int? preemptible_attempts

    call CreateGnomadAlleleFreqTsvTask {
        input:
            gatk_docker          = gatk_docker,
            gnomAD_file          = gnomAD_file,
            out_file_name        = out_file_name,
            cpu                  = cpu,
            mem_gb               = mem_gb,
            disk_space_gb        = disk_space_gb,
            boot_disk_size_gb    = boot_disk_size_gb,
            preemptible_attempts = preemptible_attempts
    }

    output {
        File gnomadAlleleFreqTsv = CreateGnomadAlleleFreqTsvTask.gnomadAlleleFreqTsv
    }
}
|
||
|
||
# Rewrites the records of a gnomAD sites VCF into a compact table using sed.
#
# For every line matching the sed pattern below, the line is replaced by six captured
# fields: contig (digits / X), position, the ID column, reference allele, alternate
# allele(s), and the value of the first `;AF=` INFO entry the pattern matches.
# sed is run without `-n`, so lines that do not match the pattern are written to the
# output unchanged.
#
# Inputs:
#   gnomAD_file          - VCF to process; may be gzip/bgzip-compressed or plain text.
#   out_file_name        - Name of the file to write.
#   gatk_docker          - Docker image in which to run.
#   mem_gb               - Optional machine memory in GB (default: 3 GB).
#   disk_space_gb        - Optional local disk size in GB (default: 100).
#   cpu                  - Optional number of cores (default: 1).
#   boot_disk_size_gb    - Optional boot disk size in GB (default: 15).
#   preemptible_attempts - Accepted for interface compatibility.
#                          NOTE(review): not used; the runtime block pins `preemptible: 0`.
#
# Output:
#   gnomadAlleleFreqTsv  - The file named by out_file_name.
task CreateGnomadAlleleFreqTsvTask {

    File gnomAD_file
    String out_file_name

    # ------------------------------------------------
    # runtime
    String gatk_docker
    Int? mem_gb
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # ------------------------------------------------
    # Get machine settings:
    Boolean use_ssd = false

    # Defaults used when the corresponding optional input is not supplied.
    Int default_ram_mb = 1024 * 3
    # WARNING: callers should size disk_space_gb to the input; the default is small relative to gnomAD.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # mem_gb is in GB, but the runtime memory value below is expressed in MB.
    Int machine_mem = if defined(mem_gb) then mem_gb * 1024 else default_ram_mb

    # ------------------------------------------------
    # Run our command:
    #
    # The default input is a bgzip-compressed VCF (.vcf.bgz), so it must be decompressed
    # before sed can match text lines. `gzip -cdf` decompresses gzip/bgzip data to stdout
    # and, because of `-f`, copies input that is not gzip-formatted through unchanged,
    # so plain-text VCFs keep working as they did with `cat`.
    command <<<
        set -e

        startTime=`date +%s.%N`
        echo "StartTime: $startTime" > timingInformation.txt

        gzip -cdf ${gnomAD_file} | sed 's#^\([0-9X]*\)\t\([0-9]*\)\t\(.*\)\t\([ATGCN]*\)\t\([ATGCN,]*\)\t.*;AF=\([e0-9\.+\-]*\);.*#\1 \2 \3 \4 \5 \6#g' > ${out_file_name}

        endTime=`date +%s.%N`
        echo "EndTime: $endTime" >> timingInformation.txt
        elapsedTime=`echo "scale=5;$endTime - $startTime" | bc`
        echo "Elapsed Time: $elapsedTime" >> timingInformation.txt
    >>>

    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        preemptible: 0
        cpu: select_first([cpu, 1])
    }

    # ------------------------------------------------
    # Outputs:
    output {
        File gnomadAlleleFreqTsv = "${out_file_name}"
    }
}
111 changes: 111 additions & 0 deletions
111
scripts/funcotator/data_sources/gnomAD/gatherVcfsCloud.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# Run GatherVcfsCloud on a list of VCF files. | ||
# | ||
# Description of inputs: | ||
# | ||
# Required: | ||
# gatk_docker - GATK Docker image in which to run | ||
# variant_vcfs - Array of Variant Context Files (VCF) containing the variants. | ||
# output_vcf_file_name - Desired name of the resulting VCF output file. | ||
# output_vcf_index_name - Desired name of the resulting VCF index output file. | ||
# | ||
# Optional: | ||
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker. | ||
# Int mem - Amount of memory to give to the machine running each task in this workflow. | ||
# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted. | ||
# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow. | ||
# Int cpu - Number of CPU cores to give to each machine running each task in this workflow. | ||
# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow. | ||
# | ||
# This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk_jar_override`` for the jar location. As of cromwell-0.24, | ||
# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file | ||
# independent of what is in the docker file. See the README.md for more info. | ||
# | ||
# Workflow entry point: hands the list of VCFs and the machine settings to the
# GatherVcfsCloud task and re-exports the task's two output files.
workflow GatherVcfsCloudWorkflow {

    # Required inputs.
    Array[File] variant_vcfs
    String output_vcf_file_name
    String output_vcf_index_name
    String gatk_docker

    # Optional overrides; each is forwarded to the task unchanged.
    File? gatk4_jar_override
    Int? cpu
    Int? mem
    Int? disk_space_gb
    Int? boot_disk_size_gb
    Int? preemptible_attempts

    call GatherVcfsCloud {
        input:
            gatk_docker          = gatk_docker,
            gatk_override        = gatk4_jar_override,
            input_vcfs           = variant_vcfs,
            output_vcf_file      = output_vcf_file_name,
            output_vcf_index     = output_vcf_index_name,
            cpu                  = cpu,
            mem                  = mem,
            disk_space_gb        = disk_space_gb,
            boot_disk_size_gb    = boot_disk_size_gb,
            preemptible_attempts = preemptible_attempts
    }

    output {
        File vcf_file  = GatherVcfsCloud.vcf_file
        File vcf_index = GatherVcfsCloud.vcf_index
    }
}
|
||
|
||
# Runs the GATK GatherVcfsCloud tool over a list of VCFs, asking the tool to create an
# index for the output, and exposes the output VCF and its index.
#
# NOTE(review): output_vcf_index is never passed to the tool; the index output is simply
# looked up under that name, so it presumably has to match the name the tool gives the
# index it creates for output_vcf_file - confirm with callers.
# NOTE(review): preemptible_attempts is accepted but not used; `preemptible` is fixed at 0.
task GatherVcfsCloud {

    # ---- data inputs ----
    Array[File] input_vcfs

    # ---- output names ----
    String output_vcf_file
    String output_vcf_index

    # ---- runtime inputs ----
    String gatk_docker
    File? gatk_override
    Int? cpu
    Int? mem
    Int? disk_space_gb
    Int? boot_disk_size_gb
    Int? preemptible_attempts

    Boolean use_ssd = false

    # Fallbacks used when the matching optional input is not supplied.
    Int default_ram_mb = 3000
    Int default_disk_space_gb = 100
    Int default_boot_disk_size_gb = 15

    # mem is given in GB; it is scaled by 1000 to MB for the runtime block.
    Int machine_mem = if defined(mem) then mem * 1000 else default_ram_mb
    # The Java heap (-Xmx) is set 1000 MB below the machine memory.
    Int command_mem = machine_mem - 1000

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
            GatherVcfsCloud \
            --create-output-variant-index true \
            -I ${sep=' -I ' input_vcfs} \
            -O ${output_vcf_file}
    >>>

    runtime {
        docker: gatk_docker
        cpu: select_first([cpu, 1])
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        preemptible: 0
    }

    output {
        File vcf_file  = "${output_vcf_file}"
        File vcf_index = "${output_vcf_index}"
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
|
||
# Indexes a FASTA file by running `samtools faidx` on it.
#
# Inputs:
#   input_fasta_file     - FASTA file to index.
#   gatk_docker          - Docker image in which to run (must provide samtools).
#   gatk_override        - Accepted for interface compatibility; not referenced by the command.
#   mem                  - Optional machine memory in GB (default: 3 GB).
#   disk_space_gb        - Optional local disk size in GB (default: 100).
#   cpu                  - Optional number of cores (default: 1).
#   boot_disk_size_gb    - Optional boot disk size in GB (default: 15).
#   preemptible_attempts - Accepted for interface compatibility.
#                          NOTE(review): not used; the runtime block pins `preemptible: 0`.
#
# Output:
#   vcf_index            - The FASTA index written by samtools (<input_fasta_file>.fai).
#                          The output keeps its historical name so existing callers still work.
task indexFastaFileTask {

    File input_fasta_file

    # runtime
    String gatk_docker
    File? gatk_override
    Int? mem
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # ------------------------------------------------
    # Get machine settings:
    Boolean use_ssd = false

    # Defaults used when the corresponding optional input is not supplied.
    Int default_ram_mb = 1024 * 3
    # WARNING: callers should size disk_space_gb to the input rather than rely on this default.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # mem is in GB, but the runtime memory value below is expressed in MB.
    Int machine_mem = if defined(mem) then mem * 1024 else default_ram_mb
    # Kept as a declaration for compatibility; the command below does not reference it.
    Int command_mem = machine_mem - 1024

    # ------------------------------------------------
    # Run our command:
    command <<<

        set -e
        samtools faidx ${input_fasta_file}

    >>>

    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        preemptible: 0
        cpu: select_first([cpu, 1])
    }

    # ------------------------------------------------
    # Outputs:
    output {
        # `samtools faidx <file>` writes its index to `<file>.fai` (not `.idx`).
        File vcf_index = "${input_fasta_file}.fai"
    }

}
Oops, something went wrong.