-
Notifications
You must be signed in to change notification settings - Fork 5
/
process.smake
138 lines (115 loc) · 3.64 KB
/
process.smake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
##
## Rules for pre-processing
##
def get_ref_sample():
return ref_path('VCF')
#
# Phase With Reference Geonme
#
rule phase_with_ref:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_RPH/{file}.vcf.gz"
params:
prefix="{dir}_RPH/{file}",
ref_sample = get_ref_sample()
threads: 64
shell:
"beaglew impute=false gt={input} ref={params.ref_sample}/{wildcards.file}.vcf.gz out={params.prefix}"
#
# Phase WITHOUT Reference Geonme
#
rule phase_without_ref:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_PH/{file}.vcf.gz"
params:
prefix="{dir}_PH/{file}"
threads: 64
shell:
"beaglew impute=false gt={input} out={params.prefix}"
#
# Run LD Prunning
#
rule ld_prune:
input:
vcf="{dir}/{file}.vcf.gz",
idx="{dir}/{file}.vcf.gz.csi"
output:
vcf="{dir}_LD/{file}.vcf.gz",
tsv="{dir}_LD/{file}.tsv.gz"
params:
ref_sample = get_ref_sample()
shell:
"bcftools isec -n=2 -w 1 {params.ref_sample}/{wildcards.file}.vcf.gz {input.vcf} -Ou | bcftools +prune -l 0.95 -w 1kb -Ou - | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' -H - | bgzip > {output.tsv} && tabix -s1 -b2 -e2 {output.tsv} && bcftools view -R {output.tsv} {input.vcf} -Oz -o {output.vcf}"
#
# Annotate with population AF (EUR_AF) and filter by MAF
#
rule annotate_eur_af:
input:
"{dir}/chr{chri}.vcf.gz"
output:
"{dir}_EurAF@{maf,[^_]+}/chr{chri}.vcf.gz"
params:
maf_upper = lambda wildcards: str(1-float(wildcards['maf']))
shell:
"bcftools annotate -a {REF_DIR}/G1K_SNP_EUR_EAF/chr{wildcards.chri}.tsv.gz -c 'CHROM,POS,REF,ALT,EUR_AF' -h {RES_DIR}/header_EUR_AF.vcf -Ou {input} | bcftools view -i 'INFO/EUR_AF>={wildcards.maf} & INFO/EUR_AF<={params.maf_upper}' -Oz -o {output} -"
#
# Annotate with population AF (REF_AF) and filter by MAF
#
rule fileter_with_ref_maf:
input:
vcf="{dir}/chr{chri}.vcf.gz",
idx="{dir}/chr{chri}.vcf.gz.csi"
output:
"{dir}_MAF@{maf,[^_]+}/chr{chri}.vcf.gz"
params:
maf_upper = lambda wildcards: str(1-float(wildcards['maf'])),
ref_sample = get_ref_sample()
shell:
"bcftools annotate -a {params.ref_sample}/chr{wildcards.chri}.vcf.gz -c 'REF_AF:=AF' -Ou {input.vcf} | bcftools view -i 'REF_AF>={wildcards.maf} & REF_AF<={params.maf_upper}' -Oz -o {output} -"
#
# Filter: retain only Bi Allelic SNPs with No Missing Genotypes
#
rule biallelic_snps_no_miss:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_BiSnpNM/{file}.vcf.gz"
shell:
"bcftools view -m2 -M2 -v snps -g ^miss -Oz -o {output} {input}"
rule biallelic_snps:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_BiSnp/{file}.vcf.gz"
shell:
"bcftools view -m2 -M2 -v snps -Oz -o {output} {input}"
rule no_miss:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_NM/{file}.vcf.gz"
shell:
"bcftools view -g ^miss -Oz -o {output} {input}"
#
# Apply quality filters
#
rule quality_checks:
input:
"{dir}/{file}.vcf.gz"
output:
"{dir}_QC/{file}.vcf.gz"
shell:
"bcftools view -i 'INFO/MQ>59 & INFO/MQRankSum>-2 & AVG(FORMAT/DP)>20 & AVG(FORMAT/DP)<100 & INFO/QD>15 & INFO/BaseQRankSum>-2 & INFO/SOR<1' -f PASS -Ou {input} | "
"bcftools annotate -x INFO,FMT -Oz -o {output} -"
rule select_sample:
input:
vcf="{dir}/{file}.vcf.gz",
csi="{dir}/{file}.vcf.gz.csi"
output:
"{dir}_S@{sample,[^_]+}/{file}.vcf.gz"
shell:
"bcftools view -S _samples/{wildcards.sample}.txt --force-samples -Oz -o {output} {input.vcf}"