-
Notifications
You must be signed in to change notification settings - Fork 0
/
crg.vcf2cre.sh
executable file
·98 lines (73 loc) · 3.66 KB
/
crg.vcf2cre.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/bin/bash
#PBS -l walltime=23:00:00,nodes=1:ppn=1
#PBS -joe .
#PBS -d .
#PBS -l vmem=50g,mem=50g
# 50g is crucial -20,30 crashes sometimes
# annotates vcf with VEP and vcfanno for cre for cre report generation
# parameters:
# original_vcf = file.vcf.gz, no tabix index is needed in the dir
# project = case = family = S11 (example)
# [optional] ped = file.ped
# qsub ~/cre/cre.vcf2cre.sh -v original_vcf=file.vcf,project=412
# in old bcbio vcf files from rna-seq pipeline, i.e. combined-annotated-rnaedit.vcf.gz, bcbio-nextgen1.0.4, some info field formats are wrong:
# AC, AF, MLEAC, MLEAF Number=1 not A. Because of that vt is not properly decomposing multiallelic variants, and vcf2db can't create gemini database
# solution is to put Number=A in the vcf header or rerun variant calling with the latest bcbio
# if input is from TCAG (HAS) it does not have DP INFO field, we need to fake it from FORMAT DP for SNVs and from DPI for indels:
# cre.vcf.has2dp.sh
# gunzip -c 331606_S1.flt.nochr.vcf.gz | grep "^#" > 331606.vcf
# add to header:
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
# gunzip -c 331606_S1.flt.nochr.vcf.gz | grep -v "^#" | grep PASS | sed s/":DPI:"/":DP:"awk -F ':' '{print $0"\tDP="$9}' | awk -F "\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$11";"$8"\t"$9"\t"$10}' >> 331606.vcf
bname=`basename $original_vcf .vcf.gz`
echo "###############################################"
echo `date` ": Removing annotation of " $original_vcf
echo "###############################################"
cre.annotation.strip.sh $original_vcf
gunzip -c $bname.no_anno.vcf.gz | grep "^#" > $project.vcf
gunzip -c $bname.no_anno.vcf.gz | grep -v "^#" | grep PASS | grep -v possible_rnaedit | egrep -v "^GL000" >> $project.vcf
bgzip $project.vcf
tabix $project.vcf.gz
bcftools sort -Oz $project.vcf.gz > $project.sorted.vcf.gz
tabix $project.sorted.vcf.gz
cre.vt.decompose.sh $project.sorted.vcf.gz
echo "################################################"
echo `date` "Annotating with VEP..."
echo "################################################"
cre.vep.sh $project.sorted.decomposed.vcf.gz
echo "################################################"
echo `date` "Annotating with vcfanno ..."
echo "################################################"
crg.vcfanno.sh $project.sorted.decomposed.vepeffects.vcf.gz
if [ -z $ped ]
then
echo "no ped file, generating ..."
bcftools query -l $original_vcf > samples.txt
> $project.ped
for sample in `cat samples.txt`
do
echo -e "1\t"$sample"\t0\t0\t0\t0" >> $project.ped
done
ped=$project.ped
fi
echo "#################################################"
echo `date` "Generating gemini database"
echo "#################################################"
vcf2db.py $project.sorted.decomposed.vepeffects.annotated.vcf.gz $ped ${project}-ensemble.db
mkdir $project
mv ${project}-ensemble.db $project
mv $project.sorted.decomposed.vepeffects.annotated.vcf.gz ${project}/${project}-ensemble-annotated-decomposed.vcf.gz
mv $project.sorted.decomposed.vepeffects.annotated.vcf.gz.tbi ${project}/${project}-ensemble-annotated-decomposed.vcf.gz.tbi
cd $project
ln -s ${project}-ensemble-annotated-decomposed.vcf.gz ${project}-gatk-haplotype-annotated-decomposed.vcf.gz
ln -s ${project}-ensemble-annotated-decomposed.vcf.gz.tbi ${project}-gatk-haplotype-annotated-decomposed.vcf.gz.tbi
cd ..
rm $bname.no_anno.vcf.gz
rm $bname.no_anno.vcf.gz.tbi
rm $project.vcf.gz
rm $project.vcf.gz.tbi
rm $project.sorted*.vcf.gz
rm $project.sorted*.vcf.gz.tbi
echo "#####################################################"
echo `date` " DONE"
echo "#####################################################"