-
Notifications
You must be signed in to change notification settings - Fork 0
/
GATK_CNV.py
189 lines (128 loc) · 7.7 KB
/
GATK_CNV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env python
from __future__ import print_function
import sys
import os.path
import argparse
import re
from subprocess import call
"""
Script for parsing a VCF file
"""
epi = ('\
\n\
Run GATK CNV pipeline from tumour and normal sample\n\
Your genome.intervals_list is output from GATK pipeline, and a panel-of-normals (from PON.py) \n\
\n\
')
# Describe what the script does
parser = argparse.ArgumentParser(description='This script writes the commands for running GATK CNV pipeline from tumour and normal data', epilog= epi, formatter_class=argparse.RawTextHelpFormatter)
# Get inputs
parser.add_argument('-g', '--genome', default=None, dest='genome', action='store', required=True, help="genome.interval_list")
parser.add_argument('-d', '--genome.dict', default=None, dest='gdic', action='store', required=True, help="genome.dict file")
parser.add_argument('-f', '--genome.fa', default=None, dest='gfa', action='store', required=True, help="genome.fa file")
parser.add_argument('-n', '--normal_bam', default=None, dest='norm', action='store', required=True, help="Normal.bam file")
parser.add_argument('-t', '--tumour_bam', default=None, dest='tum', action='store', required=True, help="Tumour.bam file")
parser.add_argument('-p', '--pon', default=None, dest='pon', action='store', required=True, help="Panel of Normal CNV hdf5 file")
# Check for no input
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# Check if input files exist
if not os.path.isfile(args.genome)==True:
print("Cannot find input file ",args.genome)
sys.exit(1)
# Check if input files exist
if not os.path.isfile(args.norm)==True:
print("Cannot find input file ",args.norm)
sys.exit(1)
# Check if input files exist
if not os.path.isfile(args.tum)==True:
print("Cannot find input file ",args.tum)
sys.exit(1)
# Check if input files exist
if not os.path.isfile(args.gdic)==True:
print("Cannot find input file ",args.gdic)
sys.exit(1)
# Check if input files exist
if not os.path.isfile(args.pon)==True:
print("Cannot find input file ",args.pon)
sys.exit(1)
# Create prefixes and files
norm=args.norm.rstrip()
normpx = norm.split("/")[-1]
normpx = ''.join(normpx.split(".bam")[0:-1])
tum=args.tum.rstrip()
tumpx = tum.split("/")[-1]
tumpx = ''.join(tumpx.split(".bam")[0:-1])
# Create versions of the genome, and link to here
genome=args.genome.split("/")[-1]
genomepx = '.'.join(genome.split(".")[0:-1])
px=tumpx
print ('gatk AnnotateIntervals -L 2_Segments/genome.preprocessed_intervals.interval_list -R genome.fa --sequence-dictionary genome.dict --interval-merging-rule OVERLAPPING_ONLY -O genome.GC.AnnotateIntervals')
# Create hdf5 files for tumour and normal
print ('bsub -P Analysis -J CRC%s -o CRC%s.o -e CRC%s.e -n 1 -R "rusage[mem=9500]" -q bio gatk --java-options "-Xmx9g" \
CollectReadCounts -I %s -L %s --interval-merging-rule OVERLAPPING_ONLY -O %s.hdf5' % (normpx,normpx,normpx,norm,args.genome,normpx))
print ('bsub -P Analysis -J CRC%s -o CRC%s.o -e CRC%s.e -n 1 -R "rusage[mem=9500]" -q bio gatk --java-options "-Xmx9g" \
CollectReadCounts -I %s -L %s --interval-merging-rule OVERLAPPING_ONLY -O %s.hdf5' % (tumpx,tumpx,tumpx,tum,args.genome,tumpx))
# Denoise the read counts for all tumour and normal samples, using the panel of normals
print('bsub -P Analysis -J DRC%s -o DRC%s.o -e DRC%s.e -n 1 -R "rusage[mem=9500]" -q bio gatk --java-options "-Xmx9g" DenoiseReadCounts -I %s.hdf5 \
--annotated-intervals genome.GC.AnnotateIntervals --count-panel-of-normals %s --standardized-copy-ratios %s.standardizedCR.tsv \
--denoised-copy-ratios %s.denoisedCR.tsv' % (normpx,normpx,normpx,normpx,args.pon,normpx,normpx))
print('bsub -P Analysis -J DRC%s -o DRC%s.o -e DRC%s.e -n 1 -R "rusage[mem=9500]" -q bio gatk --java-options "-Xmx9g" DenoiseReadCounts -I %s.hdf5 \
--annotated-intervals genome.GC.AnnotateIntervals --count-panel-of-normals %s --standardized-copy-ratios %s.standardizedCR.tsv \
--denoised-copy-ratios %s.denoisedCR.tsv' % (tumpx,tumpx,tumpx,tumpx,args.pon,tumpx,tumpx))
# Use customised R
print ('source activate r-3.5.0\nmkdir Plots')
# Plot the two files showed above to highlight differences and sanity check
print ('bsub -P Analysis -J PCR%s -o PCR%s.o -e PCR%s.e -n 1 -q bio \
gatk PlotDenoisedCopyRatio --standardized-copy-ratios %s.standardizedCR.tsv --denoised-copy-ratios %s.denoisedCR.tsv \
--sequence-dictionary %s.dict --minimum-contig-length 46709983 --output Plots --output-prefix %s' % (normpx,normpx,normpx,normpx,normpx,genomepx,normpx))
print ('bsub -P Analysis -J PCR%s -o PCR%s.o -e PCR%s.e -n 1 -q bio \
gatk PlotDenoisedCopyRatio --standardized-copy-ratios %s.standardizedCR.tsv --denoised-copy-ratios %s.denoisedCR.tsv \
--sequence-dictionary %s.dict --minimum-contig-length 46709983 --output Plots --output-prefix %s' % (tumpx,tumpx,tumpx,tumpx,tumpx,genomepx,tumpx))
#Collect counts at germline variant sites for the matched-control
print ('bsub -P Analysis -J CAC%s -o CAC%s.o -e CAC%s.e -n 1 -R "rusage[mem=30000]" -q bio \
gatk --java-options "-Xmx29g" CollectAllelicCounts -L /home/mzarowiecki/scratch/REF/af-only-gnomad.hg38.01.chr.vcf.gz \
-I %s -R %s -O %s.allelicCounts.tsv' % (normpx,normpx,normpx,args.norm,args.gfa,normpx))
#Collect counts at the same sites for the case sample
print ('bsub -P Analysis -J CAC%s -o CAC%s.o -e CAC%s.e -n 1 -R "rusage[mem=30000]" -q bio \
gatk --java-options "-Xmx29g" CollectAllelicCounts -L /home/mzarowiecki/scratch/REF/af-only-gnomad.hg38.01.chr.vcf.gz \
-I %s -R %s -O %s.allelicCounts.tsv' % (tumpx,tumpx,tumpx,args.tum,args.gfa,tumpx) )
# ModelSegments create segments across the genome for paired tumour and normal sample
print ("mkdir Segments")
print ('bsub -P Analysis -J MS%s -o MS%s.o -e MS%s.e -n 1 -R "rusage[mem=30000]" -q bio gatk --java-options "-Xmx29g" ModelSegments \
--denoised-copy-ratios %s.denoisedCR.tsv \
--allelic-counts %s.allelicCounts.tsv \
--normal-allelic-counts %s.allelicCounts.tsv \
--output Segments \
--output-prefix %s ' % (tumpx,tumpx,tumpx,tumpx,tumpx,normpx,tumpx) )
# Call copy-neutral, amplified and deleted segments with CallCopyRatioSegments
print ('bsub -P Analysis -J CRS%s -o CRS%s.o -e CRS%s.e -n 1 -q bio gatk CallCopyRatioSegments \
--input Segments/%s.cr.seg \
--output Segments/%s.called.seg ' % (tumpx,tumpx,tumpx,tumpx,tumpx) )
print ('source activate r-3.5.0\nmkdir SegPlots')
# PlotModeledSegments visualizes copy and allelic ratio segmentation results.
print ('bsub -P Analysis -J PMS%s -o PMS%s.o -e PMS%s.e -n 1 -q bio gatk PlotModeledSegments \
--denoised-copy-ratios %s.denoisedCR.tsv \
--allelic-counts Segments/%s.hets.tsv \
--segments Segments/%s.modelFinal.seg \
--sequence-dictionary %s \
--minimum-contig-length 46709983 \
--output SegPlots \
--output-prefix %s ' % (tumpx,tumpx,tumpx,tumpx,tumpx,tumpx,args.gdic,tumpx) )
# Tag germline events
print '(gatk TagGermlineEvents)'
# Check outputs
##################
#
# Continue with the GATK ACNV workflow
# https://gatkforums.broadinstitute.org/gatk/discussion/7387/description-and-examples-of-the-steps-in-the-acnv-case-workflow
#
##################
print ('gatk GetBayesianHetCoverage --reference %s --snpIntervals /home/mzarowiecki/scratch/REF/af-only-gnomad.hg38.01.chr.vcf.gz --tumor %s --tumorHets %s.tum.hets.tsv --normal %s --normalHets %s.norm.hets.tsv --hetCallingStringency 30' % (args.gfa,args.tum,tumpx,args.norm,normpx))
print ('gatk AllelicCNV --tumorHets %s.tum.hets.tsv --tangentNormalized <coverage_profile> --segments <called_segments> --outputPrefix <output_prefix>' % (tumpx))
print ('gatk CallCNLoHAndSplits --tumorHets <tumor_het_pulldown>
--segments <acnv_segments> --tangentNormalized <coverage_profile> --outputDir <output_dir>
--rhoThreshold 0.2 --numIterations 10 ')
quit()