Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: map to pangenome with vg #174

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions workflow/envs/vg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- vg =1.43.0
- samblaster =0.1.26
- samtools =1.16.1
2 changes: 1 addition & 1 deletion workflow/rules/benchmarking.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ruleorder: chm_eval_sample > map_reads
ruleorder: chm_eval_sample > vg_giraffe


rule gather_benchmark_calls:
Expand Down
34 changes: 14 additions & 20 deletions workflow/rules/mapping.smk
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
rule map_reads:
rule vg_giraffe:
input:
index=f"{genome}.gbz",
reads=get_map_reads_input,
idx=rules.bwa_index.output,
output:
temp("results/mapped/{sample}.bam"),
log:
"logs/bwa_mem/{sample}.log",
"logs/vg/giraffe/{sample}.log",
params:
extra=get_read_group,
sorting="samtools",
sort_order="coordinate",
threads: 8
wrapper:
"v1.10.0/bio/bwa/mem"
rg=get_read_group,
threads: workflow.cores
shell:
"vg giraffe --threads {threads} --gbz-name {input.index} "
"--fastq-in {input.reads} --output-format BAM --read-group {params.rg} | "
"samblaster --ignoreUnmated | samtools sort -Ob > {output} 2> {log}"


rule annotate_umis:
Expand Down Expand Up @@ -61,24 +61,18 @@ rule calc_consensus_reads:
"rbt collapse-reads-to-fragments bam {input} {output} &> {log}"


rule map_consensus_reads:
use rule vg_giraffe as map_consensus_reads with:
input:
index=f"{genome}.gbz",
reads=get_processed_consensus_input,
idx=rules.bwa_index.output,
output:
temp("results/consensus/{sample}.consensus.{read_type}.mapped.bam"),
params:
index=lambda w, input: os.path.splitext(input.idx[0])[0],
extra=lambda w: "-C {}".format(get_read_group(w)),
sort="samtools",
sort_order="coordinate",
wildcard_constraints:
read_type="pe|se",
rg=get_read_group,
log:
"logs/bwa_mem/{sample}.{read_type}.consensus.log",
threads: 8
wrapper:
"v1.10.0/bio/bwa/mem"
wildcard_constraints:
read_type="pe|se",


rule merge_consensus_reads:
Expand Down
42 changes: 34 additions & 8 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ rule get_known_variants:
"v1.12.0/bio/reference/ensembl-variation"


rule get_hprc_pangenome:
output:
"resources/hprc-pangenome.vcf.gz",
log:
"logs/get-hprc-pangenome.log",
conda:
"../envs/curl.yaml"
cache: "omit-software"
shell:
"curl -L "
"https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc.grch38.vcfbub.a100k.wave.vcf.gz "
"> {output} 2> {log}"


rule get_annotation_gz:
output:
"resources/annotation.gtf.gz",
Expand Down Expand Up @@ -111,18 +125,30 @@ rule remove_iupac_codes:
"(rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}) 2> {log}"


rule bwa_index:
rule vg_index:
input:
genome,
genome=genome,
# Use the HPRC human pangenome if possible. Otherwise known variation from Ensembl.
# The latter has the disadvantage that it does not contain haplotypes.
# See https://github.com/vgteam/vg/issues/3776.
variants="resources/hprc-pangenome.vcf.gz"
if config["ref"]["species"] == "homo_sapiens"
and config["ref"]["build"] == "GRCh38"
else "resources/variation.vcf.gz",
output:
idx=multiext(genome, ".amb", ".ann", ".bwt", ".pac", ".sa"),
f"{genome}.gbz",
log:
"logs/bwa_index.log",
resources:
mem_mb=369000,
"logs/vg/index.log",
params:
prefix=lambda w, input: os.path.splitext(input.genome)[0],
conda:
"../envs/vg.yaml"
cache: True
wrapper:
"v1.10.0/bio/bwa/index"
threads: workflow.cores
shell:
"vg autoindex -t {threads} --workflow giraffe "
"-r {input.genome} -v {input.variants} -p x "
"--prefix {params.prefix} 2> {log}"


rule get_vep_cache:
Expand Down