From fc7c1858616f4a4cc5576fba5cfdeaf45fe8fbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Thu, 10 Nov 2022 19:45:13 -0500 Subject: [PATCH 1/4] feat: map with vg --- workflow/envs/vg.yaml | 9 +++++++++ workflow/rules/mapping.smk | 34 ++++++++++++++-------------------- workflow/rules/ref.smk | 20 ++++++++++++++++++++ 3 files changed, 43 insertions(+), 20 deletions(-) create mode 100644 workflow/envs/vg.yaml diff --git a/workflow/envs/vg.yaml b/workflow/envs/vg.yaml new file mode 100644 index 000000000..e2df66485 --- /dev/null +++ b/workflow/envs/vg.yaml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - vg =1.43.0 + - samblaster + - samtools =0.1.26 + - samtools =1.16.1 diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 6ce1434e1..9790a2ffd 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,18 +1,18 @@ -rule map_reads: +rule vg_giraffe: input: + index=f"{genome}.gbz", reads=get_map_reads_input, - idx=rules.bwa_index.output, output: temp("results/mapped/{sample}.bam"), log: - "logs/bwa_mem/{sample}.log", + "logs/vg/giraffe/{sample}.log", params: - extra=get_read_group, - sorting="samtools", - sort_order="coordinate", - threads: 8 - wrapper: - "v1.10.0/bio/bwa/mem" + rg=get_read_group, + threads: workflow.cores + shell: + "vg giraffe --threads {threads} --gbz-name {input.index} " + "--fastq-in {input.reads} --output-format BAM --read-group {params.rg} | " + "samblaster --ignoreUnmated | samtools sort -Ob > {output} 2> {log}" rule annotate_umis: @@ -61,24 +61,18 @@ rule calc_consensus_reads: "rbt collapse-reads-to-fragments bam {input} {output} &> {log}" -rule map_consensus_reads: +use rule vg_giraffe as map_consensus_reads with: input: + index=f"{genome}.gbz", reads=get_processed_consensus_input, - idx=rules.bwa_index.output, output: temp("results/consensus/{sample}.consensus.{read_type}.mapped.bam"), params: - index=lambda w, input: os.path.splitext(input.idx[0])[0], - extra=lambda w: "-C {}".format(get_read_group(w)), - sort="samtools", - sort_order="coordinate", - wildcard_constraints: - read_type="pe|se", + rg=get_read_group, log: "logs/bwa_mem/{sample}.{read_type}.consensus.log", - threads: 8 - wrapper: - "v1.10.0/bio/bwa/mem" + wildcard_constraints: + read_type="pe|se", rule merge_consensus_reads: diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 8f448d938..739ac5f48 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -125,6 +125,26 @@ rule bwa_index: "v1.10.0/bio/bwa/index" +rule vg_index: + input: + genome=genome, + variants="resources/variation.vcf.gz", + output: + f"{genome}.gbz", + log: + "logs/vg/index.log", + params: + prefix=lambda w, input: os.path.splitext(input.genome)[0], + conda: + "../envs/vg.yaml" + cache: True + threads: workflow.cores + shell: + "vg autoindex -t {threads} --workflow giraffe " + "-r {input.genome} -v {input.variants} -p x " + "--prefix {params.prefix} 2> {log}" + + rule get_vep_cache: output: directory("resources/vep/cache"), From 364304f939ecefb8333f3b7d84e254fb305a337a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Thu, 10 Nov 2022 22:18:43 -0500 Subject: [PATCH 2/4] fix ruleorder --- workflow/rules/benchmarking.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/benchmarking.smk b/workflow/rules/benchmarking.smk index 28cd46acf..5787c86c0 100644 --- a/workflow/rules/benchmarking.smk +++ b/workflow/rules/benchmarking.smk @@ -1,4 +1,4 @@ -ruleorder: chm_eval_sample > map_reads +ruleorder: chm_eval_sample > vg_giraffe rule gather_benchmark_calls: From 0c9bbcbb5ae54ae2cff71982378173927375e149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Thu, 10 Nov 2022 22:35:22 -0500 Subject: [PATCH 3/4] fix env --- workflow/envs/vg.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/envs/vg.yaml b/workflow/envs/vg.yaml index e2df66485..b65e2c43b 100644 --- a/workflow/envs/vg.yaml +++ b/workflow/envs/vg.yaml @@ -4,6 +4,5 @@ channels: - nodefaults dependencies: - vg =1.43.0 - - samblaster - - samtools =0.1.26 + - samblaster =0.1.26 - samtools =1.16.1 From f5efc1fc0294c1d0398a7b87f15d653cf41b4eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Sat, 12 Nov 2022 07:13:40 -0500 Subject: [PATCH 4/4] fixes --- workflow/rules/ref.smk | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk index 739ac5f48..36a78d857 100644 --- a/workflow/rules/ref.smk +++ b/workflow/rules/ref.smk @@ -59,6 +59,20 @@ rule get_known_variants: "v1.12.0/bio/reference/ensembl-variation" +rule get_hprc_pangenome: + output: + "resources/hprc-pangenome.vcf.gz", + log: + "logs/get-hprc-pangenome.log", + conda: + "../envs/curl.yaml" + cache: "omit-software" + shell: + "curl -L " + "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/minigraph-cactus/hprc-v1.0-mc.grch38.vcfbub.a100k.wave.vcf.gz " + "> {output} 2> {log}" + + rule get_annotation_gz: output: "resources/annotation.gtf.gz", @@ -111,24 +125,16 @@ rule remove_iupac_codes: "(rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}) 2> {log}" -rule bwa_index: - input: - genome, - output: - idx=multiext(genome, ".amb", ".ann", ".bwt", ".pac", ".sa"), - log: - "logs/bwa_index.log", - resources: - mem_mb=369000, - cache: True - wrapper: - "v1.10.0/bio/bwa/index" - - rule vg_index: input: genome=genome, - variants="resources/variation.vcf.gz", + # Use the HPRC human pangenome if possible. Otherwise known variation from Ensembl. + # The latter has the disadvantage that it does not contain haplotypes. + # See https://github.com/vgteam/vg/issues/3776. + variants="resources/hprc-pangenome.vcf.gz" + if config["ref"]["species"] == "homo_sapiens" + and config["ref"]["build"] == "GRCh38" + else "resources/variation.vcf.gz", output: f"{genome}.gbz", log: