-
Notifications
You must be signed in to change notification settings - Fork 0
/
NGS_tools.txt
120 lines (92 loc) · 3.97 KB
/
NGS_tools.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#For Yale MPS sequencing workshop
#by Ben Evans
#v 0.2
#some useful sites and tools in parsing through your data
##formats##
#fastQ (sequence paired with quality scores)
#http://en.wikipedia.org/wiki/FASTQ_format
#SAM/BAM (alignments)
#http://samtools.sourceforge.net/samtools.shtml#5
#http://genome.sph.umich.edu/wiki/SAM
#VCF (variants)
#http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
##getting software##
#download to current directory
wget http://www.mysite.com/magical_package.tgz
#or
curl http://www.mysite.com/magical_package.tgz
#extract contents
tar xzvf magical_package.tgz
tar xzvf magical_package.tar.gz
tar xjvf magical_package.tar.bz2
#hopefully it's ready for you to use, if not...
#try typing make in the directory you extracted if it has a file named "Makefile"
make
#copy your fancy new executable(s) to a place where you like to keep useful software
#I like to keep mine in ~/bin
cp magical_program ~/bin/
#if you haven't already, tell bash where you keep your useful software
#add this to the end of ~/.bashrc or ~/.bash_profile files, whichever exists on your system
export PATH="$HOME/bin/:$PATH"
##Data Q/C##
#fastQC
#http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
#get an archive with your qc results for fastq1.fastq
fastqc --noextract -t numcpus fastq1.fastq
#fastx
#install http://hannonlab.cshl.edu/fastx_toolkit/download.html
#examples http://hannonlab.cshl.edu/fastx_toolkit/commandline.html
#if you are using PHRED+33 encoded fastq, use -Q33
fastq_quality_filter -Q33 -i fastq1.fastq -q 20 -p 80 -o quality_filtered.fastq
##Mapping##
#bowtie2
#http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
#samtools
#http://samtools.sourceforge.net/
#build your index
#name of reference fasta, name that you want to use in bowtie
bowtie2-build tort_sptree.fasta tort_sptree
#align your reads
#pipeline: align -> convert to bam -> position sort bam (necessary for variant calling)
#The --rg (read group) options are good practice as they specify inside your file information about your sample
#more info in the full sam spec, page 3 http://samtools.sourceforge.net/SAMv1.pdf
#They also are required if you decide to use GATK2 to variant call
bowtie2 --local --rg-id sample1 --rg SM:sample1 --rg LB:amplicon_seq --rg PL:IONTORRENT -x path/to/tort_sptree -U sequences/fastq1.fastq | samtools view -Shu - | samtools sort - sample1
#index your new bam file
samtools index sample1.bam
#using samtools to view your alignment
samtools tview sample1.bam path/to/tort_sptree.fasta
##variant calling##
#using samtools to variant call
#remove the option "I" to get indel calls too
#if you have more than one sample, simply list them
samtools mpileup -Q 17 -IuDf path/to/tort_sptree.fasta sample1.bam | bcftools view -vcg - > sample1_raw.vcf
#GATK2 variant calling
#prepare your reference
#http://gatkforums.broadinstitute.org/discussion/1601/how-can-i-prepare-a-fasta-file-to-use-as-reference
#read groups are REQUIRED in your alignments
java -jar CreateSequenceDictionary.jar R=ref.fasta O=ref.dict
samtools faidx ref.fasta
#http://www.broadinstitute.org/gatk/gatkdocs/org_broadinstitute_sting_gatk_walkers_genotyper_UnifiedGenotyper.html
##examining & filtering variants##
#vcftools
#http://sourceforge.net/projects/vcftools/files/
#http://vcftools.sourceforge.net/options.html
vcftools --vcf sample1_raw.vcf
##converting formats##
#other than stated above, you can sometimes interconvert formats with PGD spider
#http://www.cmpg.unibe.ch/software/PGDSpider/index.htm
#though be VERY careful and be sure to double check your output and the assumptions your next program will make about it!
#some linux fun
#convert all the bam files in the current directory back into fastq
for s in *.bam
do
samtools view $s | awk '!/^@/ {print "@"$1"\n"$10"\n+\n"$11}' > ${s/%bam/fastq}
done
#bash string replacement
#replace .fq with _trimmed.fq to maintain original files
#${variable/from/to}
for f in *.fq
do
fastx_trimmer -l 71 -i $f > ${f/.fq/_trimmed.fq}
done