Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BENCHMARK] Create folder structure #198

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions test/benchmark/caller_comparison/workflow/rules/plots.smk

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
configfile: "Repos/iGenVar/test/benchmark/caller_comparison/config/config.yaml"
configfile: "Repos/iGenVar/test/benchmark/config/caller_comparison_config.yaml"

include: "workflow/rules/callers.smk"
include: "workflow/rules/eval.smk"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,12 @@ rule run_igenvar:
input:
bam = config["long_bam"]
output:
vcf = "results/caller_comparison/iGenVar/variants.vcf"
vcf = "results/caller_comparison_long_read/iGenVar/variants.vcf"
threads: 1
shell:
"""
./build/iGenVar/bin/iGenVar --input_long_reads {input.bam} --output {output.vcf} \
--vcf_sample_name {sample} \
--threads {threads} \
--min_var_length {min_var_length} \
--max_var_length {max_var_length} \
--min_qual 1
./build/iGenVar/bin/iGenVar --input_long_reads {input.bam} --output {output.vcf} --vcf_sample_name {sample} \
--threads {threads} --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1
"""
# Defaults:
# --method cigar_string --method split_read --method read_pairs --method read_depth
Expand All @@ -29,28 +25,21 @@ rule run_svim:
bai = config["long_bai"],
genome = config["reference_fa"] # [E::fai_build3_core] Cannot index files compressed with gzip, please use bgzip
output:
"results/caller_comparison/SVIM/variants.vcf"
"results/caller_comparison_long_read/SVIM/variants.vcf"
resources:
mem_mb = 20000,
time_min = 600,
io_gb = 100
params:
working_dir = "results/caller_comparison/SVIM/"
working_dir = "results/caller_comparison_long_read/SVIM/"
threads: 1
# conda:
# "../../../envs/svim.yaml"
shell:
"""
svim alignment --sample {sample} \
--partition_max_distance 1000 \
--cluster_max_distance 0.5 \
--min_sv_size {min_var_length} \
--segment_gap_tolerance 20 \
--segment_overlap_tolerance 20 \
--read_names \
--max_sv_size {max_var_length} \
{params.working_dir} {input.bam} {input.genome} \
&>> logs/svim_output.log
svim alignment --sample {sample} --partition_max_distance 1000 --cluster_max_distance 0.5 \
--min_sv_size {min_var_length} --segment_gap_tolerance 20 --segment_overlap_tolerance 20 --read_names \
--max_sv_size {max_var_length} {params.working_dir} {input.bam} {input.genome} &>> logs/svim_output.log
"""
# Defaults:
# --position_distance_normalizer 900 --edit_distance_normalizer 1.0
Expand All @@ -60,7 +49,7 @@ rule run_sniffles:
input:
bam = config["long_md_bam"]
output:
"results/caller_comparison/Sniffles/raw_variants_{min_qual}.vcf"
"results/caller_comparison_long_read/Sniffles/raw_variants_{min_qual}.vcf"
resources:
mem_mb = 400000,
time_min = 1200,
Expand All @@ -71,28 +60,28 @@ rule run_sniffles:
shell:
"""
sniffles --mapped_reads {input.bam} \
--vcf results/caller_comparison/Sniffles/raw_variants_{wildcards.min_qual}.vcf \
--min_support {wildcards.min_qual} --min_length {min_var_length} --threads {threads} --genotype \
>> logs/sniffles_output.log
--vcf results/caller_comparison_long_read/Sniffles/raw_variants_{wildcards.min_qual}.vcf \
--min_support {wildcards.min_qual} --min_length {min_var_length} --threads {threads} --genotype \
>> logs/sniffles_output.log
"""

# see https://github.com/spiralgenetics/truvari/issues/43
# see https://github.com/fritzsedlazeck/Sniffles/issues/209 (Fixed, but there just hasn't been a release since the fix.)
rule fix_sniffles:
input:
"results/caller_comparison/Sniffles/raw_variants_{min_qual}.vcf"
"results/caller_comparison_long_read/Sniffles/raw_variants_{min_qual}.vcf"
output:
"results/caller_comparison/Sniffles/variants.unsorted.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/Sniffles/variants.unsorted.min_qual_{min_qual}.vcf"
run:
shell("sed 's/##INFO=<ID=SUPTYPE,Number=A/##INFO=<ID=SUPTYPE,Number=./' {input} > {output}")
shell("sed -i '4i##FILTER=<ID=STRANDBIAS,Description=\"Strand is biased.\">' {output}")

# Split to SV classes
rule fix_sniffles_2:
input:
"results/caller_comparison/Sniffles/variants.unsorted.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/Sniffles/variants.unsorted.min_qual_{min_qual}.vcf"
output:
"results/caller_comparison/Sniffles/variants.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/Sniffles/variants.min_qual_{min_qual}.vcf"
shell:
"bcftools sort {input} > {output}"

Expand All @@ -101,8 +90,8 @@ rule run_pbsv_dicsover:
input:
bam = config["long_bam"]
output:
svsig_gz = "results/caller_comparison/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison/pbsv/signatures.{region}.svsig.gz")
svsig_gz = "results/caller_comparison_long_read/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison_long_read/pbsv/signatures.{region}.svsig.gz")
resources:
mem_mb = 400000,
time_min = 2000,
Expand All @@ -114,17 +103,17 @@ rule run_pbsv_dicsover:
"pbsv discover {input.bam} {output.svsig_gz}"
# """
# for i in $(samtools view -H {input.bam} | grep '^@SQ' | cut -f2 | cut -d':' -f2); do
# pbsv discover --region $i {input.bam} results/caller_comparison/pbsv/signatures.$i.svsig.gz
# pbsv discover --region $i {input.bam} results/caller_comparison_long_read/pbsv/signatures.$i.svsig.gz
# done
# """

rule run_pbsv_call:
input:
genome = config["reference_fa"],
svsig_gz = "results/caller_comparison/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison/pbsv/signatures.{region}.svsig.gz")
svsig_gz = "results/caller_comparison_long_read/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison_long_read/pbsv/signatures.{region}.svsig.gz")
output:
"results/caller_comparison/pbsv/variants.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/pbsv/variants.min_qual_{min_qual}.vcf"
resources:
mem_mb = 400000,
time_min = 2000,
Expand All @@ -135,19 +124,19 @@ rule run_pbsv_call:
shell:
"""
pbsv call --types DEL,INS,DUP --min-sv-length {min_var_length} --max-ins-length 100K \
--call-min-reads-all-samples {wildcards.min_qual} --call-min-reads-one-sample {wildcards.min_qual} \
--call-min-reads-per-strand-all-samples 0 --call-min-bnd-reads-all-samples 0 --call-min-read-perc-one-sample 0 \
--num-threads {threads} {input.genome} {input.svsig_gz} \
results/caller_comparison/pbsv/variants.min_qual_{wildcards.min_qual}.vcf
--call-min-reads-all-samples {wildcards.min_qual} --call-min-reads-one-sample {wildcards.min_qual} \
--call-min-reads-per-strand-all-samples 0 --call-min-bnd-reads-all-samples 0 --call-min-read-perc-one-sample 0 \
--num-threads {threads} {input.genome} {input.svsig_gz} \
results/caller_comparison_long_read/pbsv/variants.min_qual_{wildcards.min_qual}.vcf
"""

rule run_pbsv_call_without_DUP:
input:
genome = config["reference_fa"],
svsig_gz = "results/caller_comparison/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison/pbsv/signatures.{region}.svsig.gz")
svsig_gz = "results/caller_comparison_long_read/pbsv/signatures.svsig.gz"
# svsig_gz = dynamic("results/caller_comparison_long_read/pbsv/signatures.{region}.svsig.gz")
output:
"results/caller_comparison/pbsv_without_DUP/variants.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/pbsv_without_DUP/variants.min_qual_{min_qual}.vcf"
resources:
mem_mb = 400000,
time_min = 2000,
Expand All @@ -158,8 +147,8 @@ rule run_pbsv_call_without_DUP:
shell:
"""
pbsv call --types DEL,INS --min-sv-length {min_var_length} --max-ins-length 100K \
--call-min-reads-all-samples {wildcards.min_qual} --call-min-reads-one-sample {wildcards.min_qual} \
--call-min-reads-per-strand-all-samples 0 --call-min-bnd-reads-all-samples 0 --call-min-read-perc-one-sample 0 \
--num-threads {threads} {input.genome} {input.svsig_gz} \
results/caller_comparison/pbsv_without_DUP/variants.min_qual_{wildcards.min_qual}.vcf
--call-min-reads-all-samples {wildcards.min_qual} --call-min-reads-one-sample {wildcards.min_qual} \
--call-min-reads-per-strand-all-samples 0 --call-min-bnd-reads-all-samples 0 --call-min-read-perc-one-sample 0 \
--num-threads {threads} {input.genome} {input.svsig_gz} \
results/caller_comparison_long_read/pbsv_without_DUP/variants.min_qual_{wildcards.min_qual}.vcf
"""
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
rule filter_vcf:
input:
"results/caller_comparison/{caller,iGenVar|SVIM}/variants.vcf"
"results/caller_comparison_long_read/{caller,iGenVar|SVIM}/variants.vcf"
output:
"results/caller_comparison/{caller,iGenVar|SVIM}/variants.min_qual_{min_qual}.vcf"
"results/caller_comparison_long_read/{caller,iGenVar|SVIM}/variants.min_qual_{min_qual}.vcf"
shell:
"bcftools view -i 'QUAL>={wildcards.min_qual}' {input} > {output}"

Expand All @@ -24,60 +24,61 @@ rule tabix:

rule truvari:
input:
vcf = "results/caller_comparison/{caller}/variants.min_qual_{min_qual}.vcf.gz",
index = "results/caller_comparison/{caller}/variants.min_qual_{min_qual}.vcf.gz.tbi"
vcf = "results/caller_comparison_long_read/{caller}/variants.min_qual_{min_qual}.vcf.gz",
index = "results/caller_comparison_long_read/{caller}/variants.min_qual_{min_qual}.vcf.gz.tbi"
params:
output_dir = "results/caller_comparison/eval/{caller}/min_qual_{min_qual}"
output_dir = "results/caller_comparison_long_read/eval/{caller}/min_qual_{min_qual}"
output:
summary = "results/caller_comparison/eval/{caller}/min_qual_{min_qual}/summary.txt"
summary = "results/caller_comparison_long_read/eval/{caller}/min_qual_{min_qual}/summary.txt"
shell:
"""
rm -rf {params.output_dir} && truvari bench -b data/truth_set/HG002_SVs_Tier1_v0.6.vcf.gz \
-c {input.vcf} -o {params.output_dir} --passonly --includebed data/truth_set/HG002_SVs_Tier1_v0.6.bed -p 0 \
&>> logs/truvari_output.log
-c {input.vcf} -o {params.output_dir} --passonly --includebed data/truth_set/HG002_SVs_Tier1_v0.6.bed -p 0 \
&>> logs/truvari_output.log
"""

rule reformat_truvari_results:
input:
"results/caller_comparison/eval/{caller}/min_qual_{min_qual}/summary.txt"
"results/caller_comparison_long_read/eval/{caller}/min_qual_{min_qual}/summary.txt"
output:
"results/caller_comparison/eval/{caller}/min_qual_{min_qual}/pr_rec.txt"
"results/caller_comparison_long_read/eval/{caller}/min_qual_{min_qual}/pr_rec.txt"
threads: 1
shell:
"""
cat {input} | grep '\<precision\>\|\<recall\>' | tr -d ',' |sed 's/^[ \t]*//' | tr -d '\"' | tr -d ' ' \
| tr ':' '\t' | awk 'OFS=\"\\t\" {{ print \"{wildcards.caller}\", \"{wildcards.min_qual}\", $1, $2 }}' > {output}
cat {input} | grep '\<precision\>\|\<recall\>' | tr -d ',' |sed 's/^[ \t]*//' | tr -d '\"' | tr -d ' ' \
| tr ':' '\t' | awk 'OFS=\"\\t\" {{ print \"{wildcards.caller}\", \"{wildcards.min_qual}\", $1, $2 }}' \
> {output}
"""

rule cat_truvari_results_all:
input:
igenvar = expand("results/caller_comparison/eval/iGenVar/min_qual_{min_qual}/pr_rec.txt",
igenvar = expand("results/caller_comparison_long_read/eval/iGenVar/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"]+1,
config["quality_ranges"]["step"]))),
svim = expand("results/caller_comparison/eval/SVIM/min_qual_{min_qual}/pr_rec.txt",
svim = expand("results/caller_comparison_long_read/eval/SVIM/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"]+1,
config["quality_ranges"]["step"]))),
sniffles = expand("results/caller_comparison/eval/Sniffles/min_qual_{min_qual}/pr_rec.txt",
sniffles = expand("results/caller_comparison_long_read/eval/Sniffles/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"]+1,
config["quality_ranges"]["step"]))),
pbsv = expand("results/caller_comparison/eval/pbsv/min_qual_{min_qual}/pr_rec.txt",
pbsv = expand("results/caller_comparison_long_read/eval/pbsv/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"]+1,
config["quality_ranges"]["step"]))),
pbsv_without_DUP = expand("results/caller_comparison/eval/pbsv_without_DUP/min_qual_{min_qual}/pr_rec.txt",
pbsv_without_DUP = expand("results/caller_comparison_long_read/eval/pbsv_without_DUP/min_qual_{min_qual}/pr_rec.txt",
min_qual=list(range(config["quality_ranges"]["from"],
config["quality_ranges"]["to"]+1,
config["quality_ranges"]["step"])))
output:
igenvar = temp("results/caller_comparison/eval/igenvar.all_results.txt"),
svim = temp("results/caller_comparison/eval/svim.all_results.txt"),
sniffles = temp("results/caller_comparison/eval/sniffles.all_results.txt"),
pbsv = temp("results/caller_comparison/eval/pbsv.all_results.txt"),
pbsv_without_DUP = temp("results/caller_comparison/eval/pbsv_without_DUP.all_results.txt"),
all = "results/caller_comparison/eval/all_results.txt"
igenvar = temp("results/caller_comparison_long_read/eval/igenvar.all_results.txt"),
svim = temp("results/caller_comparison_long_read/eval/svim.all_results.txt"),
sniffles = temp("results/caller_comparison_long_read/eval/sniffles.all_results.txt"),
pbsv = temp("results/caller_comparison_long_read/eval/pbsv.all_results.txt"),
pbsv_without_DUP = temp("results/caller_comparison_long_read/eval/pbsv_without_DUP.all_results.txt"),
all = "results/caller_comparison_long_read/eval/all_results.txt"
threads: 1
run:
shell("cat {input.igenvar} > {output.igenvar}")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
rule plot_pr_all_results:
input:
"results/caller_comparison_long_read/eval/all_results.txt"
output:
"results/caller_comparison_long_read/eval/results.all.png"
log:
"logs/rplot.all.log"
shell:
"""
Rscript --vanilla Repos/iGenVar/test/benchmark/caller_comparison_long_read/workflow/scripts/plot_all_results.R \
{input} Repos/iGenVar/test/benchmark/F1_score.csv {output} > {log}
"""
13 changes: 13 additions & 0 deletions test/benchmark/caller_comparison_short_read/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
configfile: "Repos/iGenVar/test/benchmark/config/caller_comparison_config.yaml"

include: "workflow/rules/callers.smk"
include: "workflow/rules/eval.smk"
include: "workflow/rules/plots.smk"

##### Target rules #####

rule all:
input:
# SV calling
expand("results/caller_comparison/SVIM/variants.vcf"),
expand("results/caller_comparison/eval/results.all.png")
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
sample = config["parameters"]["sample"],
min_var_length = config["parameters"]["min_var_length"],
max_var_length = config["parameters"]["max_var_length"]

# iGenVar
# TODO (irallia 07.03.2022): change this to short read input
rule run_iGenVar_S:
input:
bam = config["long_bam"]
output:
vcf = "results/caller_comparison_short_read/iGenVar/variants.vcf"
threads: 1
shell:
"""
./build/iGenVar/bin/iGenVar --input_long_reads {input.bam} --output {output.vcf} --vcf_sample_name {sample} \
--threads {threads} --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1
"""

# TODO (irallia 07.03.2022): add short read input
rule run_iGenVar_SL:
input:
bam = config["long_bam"]
output:
vcf = "results/caller_comparison_short_read/iGenVar/variants.vcf"
threads: 1
shell:
"""
./build/iGenVar/bin/iGenVar --input_long_reads {input.bam} --output {output.vcf} --vcf_sample_name {sample} \
--threads {threads} --min_var_length {min_var_length} --max_var_length {max_var_length} --min_qual 1
"""
# Defaults:
# --method cigar_string --method split_read --method read_pairs --method read_depth
# --clustering_methods hierarchical_clustering --refinement_methods no_refinement
# --max_tol_inserted_length 50 --max_overlap 10 --hierarchical_clustering_cutoff 0.5
Loading