-
Notifications
You must be signed in to change notification settings - Fork 0
/
pipeline.sh
124 lines (103 loc) · 4.08 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/bash
# WARNING: Do not execute this file all at once.
# Several commands submit batch jobs and need to be run sequentially.
### bash variables ###
# Prefix for reference genome files.
# For example, if your reference genome is in the file `/Users/username/reference/ref_sequence.fasta`,
# REFPREFIX should be `/Users/username/reference/ref_sequence`
REFPREFIX=path_to_reference_genome
# NOTE: bowtie2 and GATK may require you to make some index files for your reference,
# but I don't remember which ones. Please see the docs for those programs.
# File with the name of one chromosome on each line.
# Names should match your reference sequence file.
CHROMFILE=chromosomes.txt
# Text file containing metadata for your sequenced timepoints
# The first four columns specify nextera tag information.
# Column 5 gives the population name.
# Column 6 gives the generation number.
# Rows should be sorted by population and then by generation.
PLANFILE=planfile.txt
# Directory containing your zipped original fastq files
SOURCEDIR=path_to_raw_fastq_files/
# Directory to store large intermediate files (e.g. .fastq, .bam)
DATADIR=path_to_intermediate_data_files/
# Path to GATK .jar file, wherever it is installed.
GATKPATH=/absolute/path/GenomeAnalysisTK.jar
# Create a log directory for slurm log files and directories for output files
mkdir -p log/
mkdir -p $DATADIR/fastq/
mkdir -p $DATADIR/bam/
mkdir -p $DATADIR/vcf/
### Batch jobs on computing cluster ###
# Unzip and rename fastq files according to population and generation.
# Creates new files, does not modify source files.
# WARNING: Make sure that your fastq filenames match the format in unzip_and_rename.slurm
sbatch -J unzip -o log/unzip.out -e log/unzip.err \
src/unzip_and_rename.slurm \
$PLANFILE \
$SOURCEDIR/ \
$DATADIR/fastq/
# Trim adaptor sequences from fastq files.
sbatch -J trim -o log/trim.out -e log/trim.err \
src/trim_fastq.slurm \
$PLANFILE \
$DATADIR/fastq/
# Align reads to reference sequence with bowtie2.
# NOTE: Change to load the modules on your cluster
module load centos6/bowtie2-2.1.0
module load centos6/samtools-0.1.19
module load hpc/picard-tools-1.44
bash src/submit_bowtie_and_picard_jobs.sh \
$PLANFILE \
$DATADIR/fastq/ \
$DATADIR/bam/ \
$REFPREFIX
# Make a list of the duplicate-marked bam files you want to include in GATK run.
# NOTE: edit this file to ignore some bam files or include others.
ls -d $DATADIR/bam/*.dm.bam > bam_list.txt
# Call candidate variants using GATK.
sbatch --array=1-$(wc -l $CHROMFILE) \
src/run_GATK.slurm \
$GATKPATH \
$REFPREFIX \
$CHROMFILE \
bam_list.txt \
$DATADIR/vcf/snps_and_indels
# Parse vcf and merge files.
sbatch src/parse_vcfs.slurm \
$CHROMFILE \
$DATADIR/vcf/snps_and_indels \
$PLANFILE \
$DATADIR/vcf/snps_and_indels_parsed.txt
### May be done locally or interactively ###
# Filter candidate mutations. This is done in two steps:
# This script does most of the filtering. It filters on:
# - number of alternate alleles (=1)
# - total read count supporting the mutation (>=10)
# - maximum frequency within the focal population (>0.1)
# - average frequency outside of the focal population (<0.05)
# - frequency lag-1 autoccorelation (>0.2)
# NOTE: filter values are "magic numbers" in the code. You can modify them there.
python filter_mutations.py \
$DATADIR/vcf/snps_and_indels_parsed.txt \
$PLANFILE \
> snps_and_indels_filtered1.txt
# This script does one additonal filtering step:
# calculates the likelihood of the data under the model:
# k ~ Binom(n,p),
# where k is the total allele count within the focal popualtion,
# n is the total coverage in the focal population,
# and p is the allele frequency outside of the focal population.
python filter_by_binomial_pvalues.py \
snps_and_indels_filtered1.txt \
1e-5 low \
$PLANFILE \
> snps_and_indels_filtered2.txt
# Annotate mutations.
python src/annotate_mutations.py \
$CHROMFILE \
src/genetic_code_table.txt \
$REFPREFIX.gff \
$REFPREFIX.fasta \
< snps_and_indels_filtered2.txt \
> snps_and_indels_filtered_annotated.txt