Skip to content

Commit

Permalink
Adding scripts to liftover gnomAD. Also bugfixes for Funcotator NIO. (#…
Browse files Browse the repository at this point in the history
…5514)

* Added liftover chain file creation script.
* Added WDLs and some arguments to lift over gnomAD
* Added chain file for b37->hg38 and arguments for liftover.
* Limited to 1000 records in memory.
* Added stack trace option to all wdls and sub tasks.
* Fixed output to be consistent with local files for indexing.
* Added timing information on wdls.
* Added a wdl/json to create a TSV from gnomAD allele freq data.
* Updated indexFeatureFile wdl, added params for run to index gnomAD.
* Added json file for indexing a large gnomad file.
* Fixed critical issues with NIO data sources.
* Updates to the test script to save output and point to full cloud data.
* Added a logger to SeekableByteChannelPrefetcher (no messages logged).
  • Loading branch information
jonn-smith authored Dec 17, 2018
1 parent b9b1d34 commit 02682d5
Show file tree
Hide file tree
Showing 19 changed files with 58,052 additions and 60 deletions.
Empty file.
567 changes: 567 additions & 0 deletions scripts/funcotator/data_sources/createLiftoverForB37ToHg38.sh

Large diffs are not rendered by default.

56,506 changes: 56,506 additions & 0 deletions scripts/funcotator/data_sources/gnomAD/b37ToHg38.over.chain

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"CreateGnomadAlleleFreqTsv.gatk_docker": "broadinstitute/gatk:4.0.11.0",

"CreateGnomadAlleleFreqTsv.gnomAD_file": "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz",
"CreateGnomadAlleleFreqTsv.out_file_name": "gnomad.genomes.r2.1.sites.alleleFreqs.tsv",

"CreateGnomadAlleleFreqTsv.mem_gb": "128",
"CreateGnomadAlleleFreqTsv.disk_space_gb": "16384",
"CreateGnomadAlleleFreqTsv.boot_disk_size_gb": "100"
}
115 changes: 115 additions & 0 deletions scripts/funcotator/data_sources/gnomAD/createGnomadAlleleFreqTsv.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Create a TSV containing genomic position, dbSNP ID, alleles, and the allele frequency from v2.1 of gnomAD (hg19/b37).
#
# NOTE: This will by default download all of gnomAD to disk. This is a big file, so be careful!
#
# Description of inputs:
#
# Required:
# String gatk_docker - GATK Docker image in which to run
# File gnomAD_file - gnomAD VCF file to process
# String out_file_name - Output file name.
#
# Optional:
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker. (NOTE: this workflow does not currently declare this input.)
# Int mem_gb - Amount of memory (in GB) to give to the machine running each task in this workflow.
# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted.
# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow.
# Int cpu - Number of CPU cores to give to each machine running each task in this workflow.
# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow.
#
# This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk_jar_override`` for the jar location. As of cromwell-0.24,
# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file
# independent of what is in the docker file. See the README.md for more info.
#
workflow CreateGnomadAlleleFreqTsv {

    # ------------------------------------------------
    # Data inputs:
    # gnomAD sites VCF to process. Defaults to the public gnomAD v2.1 genomes release.
    File gnomAD_file = "gs://gnomad-public/release/2.1/vcf/genomes/gnomad.genomes.r2.1.sites.vcf.bgz"
    # Name of the file that the task writes its result to.
    String out_file_name

    # ------------------------------------------------
    # Runtime inputs. The optional values are forwarded unchanged to the task below.
    String gatk_docker
    Int? cpu
    Int? mem_gb
    Int? disk_space_gb
    Int? boot_disk_size_gb
    Int? preemptible_attempts

    # ------------------------------------------------
    # Single step: extract the allele frequency columns from the gnomAD file.
    call CreateGnomadAlleleFreqTsvTask {
        input:
            gatk_docker          = gatk_docker,
            gnomAD_file          = gnomAD_file,
            out_file_name        = out_file_name,
            cpu                  = cpu,
            mem_gb               = mem_gb,
            disk_space_gb        = disk_space_gb,
            boot_disk_size_gb    = boot_disk_size_gb,
            preemptible_attempts = preemptible_attempts
    }

    # ------------------------------------------------
    # Expose the task's output file as the workflow output.
    output {
        File gnomadAlleleFreqTsv = CreateGnomadAlleleFreqTsvTask.gnomadAlleleFreqTsv
    }
}


# Extract position, ID, alleles and allele frequency (AF) from a gnomAD VCF with sed
# and write the result to `out_file_name`.
#
# Inputs:
#   gnomAD_file          - VCF to process; may be plain text or gzip/bgzip compressed.
#   out_file_name        - Name of the output file to create in the execution directory.
#   gatk_docker          - Docker image in which to run the command.
#   mem_gb               - (optional) Machine memory in GB; defaults to 3 GB (default_ram_mb).
#   preemptible_attempts - (optional) Number of preemptible attempts; defaults to 0.
#   disk_space_gb        - (optional) Local disk size in GB; defaults to default_disk_space_gb.
#   cpu                  - (optional) Number of CPU cores; defaults to 1.
#   boot_disk_size_gb    - (optional) Boot disk size in GB; defaults to default_boot_disk_size_gb.
#
# Outputs:
#   gnomadAlleleFreqTsv  - The file written to `out_file_name`.
#
# Side output: timingInformation.txt (start / end / elapsed time) is written to the
# execution directory but is not declared as a task output.
task CreateGnomadAlleleFreqTsvTask {

    File gnomAD_file
    String out_file_name

    # ------------------------------------------------
    # runtime
    String gatk_docker
    Int? mem_gb
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # ------------------------------------------------
    # Get machine settings:
    Boolean use_ssd = false

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 1024 * 3
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem_gb) then mem_gb * 1024 else default_ram_mb

    # ------------------------------------------------
    # Run our command:
    command <<<
        # pipefail: without it a failure of the reader stage would be masked by sed's exit status.
        set -e
        set -o pipefail

        startTime=`date +%s.%N`
        echo "StartTime: $startTime" > timingInformation.txt

        # The gnomAD release files are bgzip-compressed (.vcf.bgz), so they must be
        # decompressed before sed can match anything.  `gzip -cdf` decompresses gzip/bgzip
        # input and copies non-gzip input through unchanged, so plain-text VCFs still work.
        #
        # sed rewrites each record whose INFO field contains `;AF=<number>;` to:
        #     CHROM POS ID REF ALT AF
        # Lines the pattern does not match (e.g. VCF header lines) are passed through unchanged.
        gzip -cdf ${gnomAD_file} | sed 's#^\([0-9X]*\)\t\([0-9]*\)\t\(.*\)\t\([ATGCN]*\)\t\([ATGCN,]*\)\t.*;AF=\([e0-9\.+\-]*\);.*#\1 \2 \3 \4 \5 \6#g' > ${out_file_name}

        endTime=`date +%s.%N`
        echo "EndTime: $endTime" >> timingInformation.txt
        elapsedTime=`echo "scale=5;$endTime - $startTime" | bc`
        echo "Elapsed Time: $elapsedTime" >> timingInformation.txt
    >>>

    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        # Honor the caller's preemptible_attempts; fall back to 0 (never preempt) when unset.
        preemptible: select_first([preemptible_attempts, 0])
        cpu: select_first([cpu, 1])
    }

    # ------------------------------------------------
    # Outputs:
    output {
        File gnomadAlleleFreqTsv = "${out_file_name}"
    }
}
111 changes: 111 additions & 0 deletions scripts/funcotator/data_sources/gnomAD/gatherVcfsCloud.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Run GatherVcfsCloud on a list of VCF files.
#
# Description of inputs:
#
# Required:
# gatk_docker - GATK Docker image in which to run
# variant_vcfs - Array of Variant Call Format (VCF) files containing the variants.
# output_vcf_file_name - Desired name of the resulting VCF output file.
# output_vcf_index_name - Desired name of the resulting VCF index output file.
#
# Optional:
# File gatk4_jar_override - Override Jar file containing GATK 4. Use this when overriding the docker JAR or when using a backend without docker.
# Int mem - Amount of memory (in GB) to give to the machine running each task in this workflow.
# Int preemptible_attempts - Number of times to allow each task in this workflow to be preempted.
# Int disk_space_gb - Amount of storage disk space (in Gb) to give to each machine running each task in this workflow.
# Int cpu - Number of CPU cores to give to each machine running each task in this workflow.
# Int boot_disk_size_gb - Amount of boot disk space (in Gb) to give to each machine running each task in this workflow.
#
# This WDL needs to decide whether to use the ``gatk_jar`` or ``gatk_jar_override`` for the jar location. As of cromwell-0.24,
# this logic *must* go into each task. Therefore, there is a lot of duplicated code. This allows users to specify a jar file
# independent of what is in the docker file. See the README.md for more info.
#
workflow GatherVcfsCloudWorkflow {

    # ------------------------------------------------
    # Data inputs:
    # VCF files to gather, in the order they are passed to the tool.
    Array[File] variant_vcfs
    # Names of the gathered VCF and of its index file.
    String output_vcf_file_name
    String output_vcf_index_name

    # ------------------------------------------------
    # Runtime inputs. The optional values are forwarded unchanged to the task below.
    String gatk_docker
    File? gatk4_jar_override
    Int? cpu
    Int? mem
    Int? disk_space_gb
    Int? boot_disk_size_gb
    Int? preemptible_attempts

    # ------------------------------------------------
    # Single step: run GatherVcfsCloud over all of the input VCFs.
    call GatherVcfsCloud {
        input:
            gatk_docker          = gatk_docker,
            gatk_override        = gatk4_jar_override,
            input_vcfs           = variant_vcfs,
            output_vcf_file      = output_vcf_file_name,
            output_vcf_index     = output_vcf_index_name,
            cpu                  = cpu,
            mem                  = mem,
            disk_space_gb        = disk_space_gb,
            boot_disk_size_gb    = boot_disk_size_gb,
            preemptible_attempts = preemptible_attempts
    }

    # ------------------------------------------------
    # Expose the gathered VCF and its index as the workflow outputs.
    output {
        File vcf_file  = GatherVcfsCloud.vcf_file
        File vcf_index = GatherVcfsCloud.vcf_index
    }
}


# Run GATK GatherVcfsCloud over `input_vcfs`, writing a single VCF to `output_vcf_file`.
#
# Inputs:
#   input_vcfs           - VCF files to gather; each is passed to the tool with its own -I.
#   output_vcf_file      - Name of the gathered VCF to write.
#   output_vcf_index     - Name of the index file to collect as an output.
#                          NOTE(review): the command never passes this name to the tool, so it
#                          presumably has to match the index name the tool derives from
#                          output_vcf_file - confirm against the GatherVcfsCloud docs.
#   gatk_docker          - Docker image in which to run.
#   gatk_override        - (optional) GATK jar to use instead of /root/gatk.jar.
#   mem                  - (optional) Machine memory in GB; defaults to default_ram_mb (3000 MB).
#   preemptible_attempts - (optional) Number of preemptible attempts; defaults to 0.
#   disk_space_gb        - (optional) Local disk size in GB; defaults to default_disk_space_gb.
#   cpu                  - (optional) Number of CPU cores; defaults to 1.
#   boot_disk_size_gb    - (optional) Boot disk size in GB; defaults to default_boot_disk_size_gb.
#
# Outputs:
#   vcf_file             - The file named by output_vcf_file.
#   vcf_index            - The file named by output_vcf_index.
task GatherVcfsCloud {
    # inputs
    Array[File] input_vcfs

    # outputs
    String output_vcf_file
    String output_vcf_index

    # runtime
    String gatk_docker
    File? gatk_override
    Int? mem
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    Boolean use_ssd = false

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 3000
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb
    # Leave 1000 MB of the machine's memory outside the Java heap.
    Int command_mem = machine_mem - 1000

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        gatk --java-options "-Xmx${command_mem}m -DGATK_STACKTRACE_ON_USER_EXCEPTION=true" \
            GatherVcfsCloud \
            --create-output-variant-index true \
            -I ${sep=' -I ' input_vcfs} \
            -O ${output_vcf_file}
    >>>

    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        # Honor the caller's preemptible_attempts; fall back to 0 (never preempt) when unset.
        preemptible: select_first([preemptible_attempts, 0])
        cpu: select_first([cpu, 1])
    }

    output {
        File vcf_file = "${output_vcf_file}"
        File vcf_index = "${output_vcf_index}"
    }
}
56 changes: 56 additions & 0 deletions scripts/funcotator/data_sources/gnomAD/indexFastaFile.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

# Create a FASTA index for `input_fasta_file` with `samtools faidx`.
#
# Inputs:
#   input_fasta_file     - FASTA file to index.
#   gatk_docker          - Docker image in which to run (must provide samtools).
#   gatk_override        - (optional) Accepted for interface parity with the other tasks;
#                          not used by the command.
#   mem                  - (optional) Machine memory in GB; defaults to 3 GB (default_ram_mb).
#   preemptible_attempts - (optional) Number of preemptible attempts; defaults to 0.
#   disk_space_gb        - (optional) Local disk size in GB; defaults to default_disk_space_gb.
#   cpu                  - (optional) Number of CPU cores; defaults to 1.
#   boot_disk_size_gb    - (optional) Boot disk size in GB; defaults to default_boot_disk_size_gb.
#
# Outputs:
#   vcf_index            - The FASTA index written next to the input (`<input_fasta_file>.fai`).
#                          The output keeps its historical name even though it is a FASTA
#                          index, so that existing callers continue to work.
task indexFastaFileTask {

    File input_fasta_file

    # runtime
    String gatk_docker
    File? gatk_override
    Int? mem
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu
    Int? boot_disk_size_gb

    # ------------------------------------------------
    # Get machine settings:
    Boolean use_ssd = false

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 1024 * 3
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
    Int default_disk_space_gb = 100

    Int default_boot_disk_size_gb = 15

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1024 else default_ram_mb
    # Not used by the command below (samtools takes no Java heap option); kept so that
    # the task's set of declarations stays unchanged.
    Int command_mem = machine_mem - 1024

    # ------------------------------------------------
    # Run our command:
    command <<<

        set -e
        # Writes the index to <input_fasta_file>.fai, alongside the input file.
        samtools faidx ${input_fasta_file}

    >>>

    # ------------------------------------------------
    # Runtime settings:
    runtime {
        docker: gatk_docker
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        bootDiskSizeGb: select_first([boot_disk_size_gb, default_boot_disk_size_gb])
        # Honor the caller's preemptible_attempts; fall back to 0 (never preempt) when unset.
        preemptible: select_first([preemptible_attempts, 0])
        cpu: select_first([cpu, 1])
    }

    # ------------------------------------------------
    # Outputs:
    output {
        # `samtools faidx` names its index <fasta>.fai, not <fasta>.idx.
        File vcf_index = "${input_fasta_file}.fai"
    }

}
Loading

0 comments on commit 02682d5

Please sign in to comment.