#Log in to koko: ssh [email protected]
###FK MCAV Reads Processing Protocol
##Downloading Reads from Basespace
#Go to the bin on your local drive
cd ~/bin
brew tap basespace/basespace && brew install bs-cli #Use homebrew to install the basespace command line downloader
bs auth #Should generate a link to authenticate the usage, go to the link in your browser
bs download project --name JA20179 -o ./rawReads #--name is the project name and -o is the output directory; bs writes to a local path, so download locally first
scp -r ./rawReads [email protected]:~/2bRAD/floridaKeys/ #Then copy the raw reads up to koko
# bs download project --name JA20179 -o /Users/student/Documents/Florida\ Keys/rawReads #Or download to a directory in your local home
cd ~/2bRAD/floridaKeys/rawReads #There should be 50 files (25 samples x 2 lanes)
cd ~/2bRAD/floridaKeys #Return to the project directory so the relative paths below resolve
################################################################################
mkdir concatReads #Make a new directory where you can concatenate your reads between the two lanes.
srun cp ./rawReads/*.fastq.gz ./concatReads
cd concatReads
gunzip *.fastq.gz #unzip all of your files
nano concat.sh #See script below, concatenate all of your files across lanes
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
cat 10_S38_L001_R1_001.fastq 10_S38_L002_R1_001.fastq > fk10.fq
cat 11_S39_L001_R1_001.fastq 11_S39_L002_R1_001.fastq > fk11.fq
cat 12_S40_L001_R1_001.fastq 12_S40_L002_R1_001.fastq > fk12.fq
cat 13_S41_L001_R1_001.fastq 13_S41_L002_R1_001.fastq > fk13.fq
cat 14_S42_L001_R1_001.fastq 14_S42_L002_R1_001.fastq > fk14.fq
cat 15_S43_L001_R1_001.fastq 15_S43_L002_R1_001.fastq > fk15.fq
cat 16_S44_L001_R1_001.fastq 16_S44_L002_R1_001.fastq > fk16.fq
cat 17_S45_L001_R1_001.fastq 17_S45_L002_R1_001.fastq > fk17.fq
cat 18_S46_L001_R1_001.fastq 18_S46_L002_R1_001.fastq > fk18.fq
cat 19_S47_L001_R1_001.fastq 19_S47_L002_R1_001.fastq > fk19.fq
cat 1_S29_L001_R1_001.fastq 1_S29_L002_R1_001.fastq > fk1.fq
cat 20_S48_L001_R1_001.fastq 20_S48_L002_R1_001.fastq > fk20.fq
cat 21_S49_L001_R1_001.fastq 21_S49_L002_R1_001.fastq > fk21.fq
cat 22_S50_L001_R1_001.fastq 22_S50_L002_R1_001.fastq > fk22.fq
cat 23_S51_L001_R1_001.fastq 23_S51_L002_R1_001.fastq > fk23.fq
cat 24_S52_L001_R1_001.fastq 24_S52_L002_R1_001.fastq > fk24.fq
cat 25_S53_L001_R1_001.fastq 25_S53_L002_R1_001.fastq > fk25.fq
cat 2_S30_L001_R1_001.fastq 2_S30_L002_R1_001.fastq > fk2.fq
cat 3_S31_L001_R1_001.fastq 3_S31_L002_R1_001.fastq > fk3.fq
cat 4_S32_L001_R1_001.fastq 4_S32_L002_R1_001.fastq > fk4.fq
cat 5_S33_L001_R1_001.fastq 5_S33_L002_R1_001.fastq > fk5.fq
cat 6_S34_L001_R1_001.fastq 6_S34_L002_R1_001.fastq > fk6.fq
cat 7_S35_L001_R1_001.fastq 7_S35_L002_R1_001.fastq > fk7.fq
cat 8_S36_L001_R1_001.fastq 8_S36_L002_R1_001.fastq > fk8.fq
cat 9_S37_L001_R1_001.fastq 9_S37_L002_R1_001.fastq > fk9.fq
sbatch concat.sh #Submit the concatenation job after saving the script
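#The 25 cat lines above can equivalently be written as a loop; a minimal sketch, assuming every file follows the <sample>_<S##>_<lane>_R1_001.fastq pattern shown above:
for f in *_L001_R1_001.fastq; do
s=${f%%_*} #sample number is everything before the first underscore
cat $f ${f/L001/L002} > fk${s}.fq
done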
################################################################################
mkdir trimmedReads
srun cp ./concatReads/*.fq ./trimmedReads
#Set up a script to trim and deduplicate the files
2bRAD_trim_launch_dedup.pl fq > trims #Writes one trim/dedup command per .fq file into trims
launcher_creator.py -j trims -n trims -t 2:00:00 -e [email protected] -q shortq7
sbatch trims.slurm
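#Sanity check once the job finishes; the trim/dedup step writes .tr0 files (see the rename step below):
ls *.tr0 | wc -l #Should match the number of input .fq files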
################################################################################
mkdir renamedReads
srun cp ./trimmedReads/*.tr0 ./renamedReads
#Create and scp a csv file with one column of current file names (as output by deduplication, with the in-line barcodes) and one column of the sample IDs you want to rename the files to. Upload the csv to the working directory along with the sampleRename.py script
python sampleRename.py -i sampleRename -f tr0
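#A hypothetical sampleRename csv, assuming the two-column current-name,new-name layout described above (names are illustrative only):
#fk1_AGAC.tr0,FK001
#fk2_AGAC.tr0,FK002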
################################################################################
mkdir highQualityReads
srun cp ./renamedReads/*.tr0 ./highQualityReads
# Quality filtering using fastx_toolkit
# Creating a list of filtering commands:
# The options -q 20 -p 90 mean that 90% or more of all bases within the read should have PHRED quality of at least 20 (i.e., probability of error 1% or less)
# PHRED quality=10*(-log10(P.error))
ls *.tr0 | perl -pe 's/^(\S+)\.tr0$/cat $1\.tr0 \| fastq_quality_filter -q 20 -p 90 >$1\.trim/' >filt0
# NOTE: run the next line ONLY if your qualities are 33-based (GSAF results are 33-based):
cat filt0 | perl -pe 's/filter /filter -Q33 /' > filt
#if you did NOT run the line above, run: mv filt0 filt
launcher_creator.py -j filt -n filt -t 1:00:00 -e [email protected] -q shortq7
sbatch filt.slurm
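#For reference, after the -Q33 edit each line of filt should read like this (illustrative sample name):
#cat fk1.tr0 | fastq_quality_filter -Q33 -q 20 -p 90 >fk1.trim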
################################################################################
mkdir mappedReadsNoConcat
srun cp ./highQualityReads/*.trim ./mappedReadsNoConcat
mkdir referenceGenome #Ensure that the concatenated MCAV and algal symbiont transcriptomes are in this directory along with the associated genome index files
cd mappedReadsNoConcat
GENOME_FASTA=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/mcav_syms_updated.fasta
# mapping with --local option, enables clipping of mismatching ends (guards against deletions near ends of RAD tags)
2bRAD_bowtie2_launch.pl '\.trim$' $GENOME_FASTA > maps #Writes one bowtie2 command per .trim file into maps
launcher_creator.py -j maps -n maps -t 2:00:00 -e [email protected] -q shortq7 #Execute all commands in maps
sbatch maps.slurm
#Check that the number of .sam files produced matches the number of .trim input files
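#Each line of maps is expected to be a local-mode bowtie2 call, roughly of this form (exact flags come from the launcher script; fk1 is illustrative):
#bowtie2 --local -x $GENOME_FASTA -U fk1.trim -S fk1.trim.bt2.sam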
################################################################################
mkdir bamsNoConcat
srun cp ./mappedReadsNoConcat/*.sam ./bamsNoConcat
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
#Execute all commands written to s2b
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *.bam > bams #Bam list for angsd (matches the -b argument below)
#Ensure that the number of output bams files matches the number of original sams files
###Now run ANGSD to produce an IBS matrix, build an IBS dendrogram from it, and check that all re-extracts, technical replicates, etc. cluster as clones of one another
export GENOME_REF=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/Mcavernosa_July2018.fasta
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -baq 1 -ref $GENOME_REF -maxDepth 2570"
TODO="-doQsDist 1 -doDepth 1 -doCounts 1"
angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out dd
Rscript ~/bin/plotQC.R dd
cat dd.info
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 30 -minInd 205 -snp_pval 1e-5 -minMaf 0.05 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 32 -doPost 1 -doGlf 2"
angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out noSamplesConcat
NSITES=`zcat noSamplesConcat.mafs.gz | wc -l`
echo $NSITES
## 13,819 SNPs
#scp the ibsMat and use it to generate a cluster dendrogram so you can identify repeated samples, clones, etc.
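#A minimal dendrogram sketch (run wherever R is available), assuming noSamplesConcat.ibsMat and the bams list sit in the working directory:
Rscript -e 'm <- as.matrix(read.table("noSamplesConcat.ibsMat")); b <- scan("bams", what="character"); dimnames(m) <- list(b, b); pdf("ibsDendro.pdf"); plot(hclust(as.dist(m), "ave"), cex=0.5); dev.off()'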
################################################################################
mkdir tr0Concat
srun cp ./renamedReads/*.tr0 ./tr0Concat
###Concatenate any re-extracts or bad-barcode files that the dendrogram shows are clones of good-barcode samples.
##For now leave technical replicates and natural clones
#YOU SHOULD HAVE 223 FILES
################################################################################
mkdir highQualityConcat
srun cp ./tr0Concat/*.tr0 ./highQualityConcat
# Quality filtering using fastx_toolkit
# The options -q 20 -p 90 mean that 90% or more of all bases within the read should have PHRED quality of at least 20 (i.e., probability of error 1% or less)
# PHRED quality=10*(-log10(P.error))
ls *.tr0 | perl -pe 's/^(\S+)\.tr0$/cat $1\.tr0 \| fastq_quality_filter -q 20 -p 90 >$1\.trim/' >filt0
# NOTE: run the next line ONLY if your qualities are 33-based (GSAF results are 33-based):
cat filt0 | perl -pe 's/filter /filter -Q33 /' > filt
#if you did NOT run the line above, run this one:
mv filt0 filt
launcher_creator.py -j filt -n filt -t 1:00:00 -e [email protected] -q shortq7
sbatch filt.slurm
################################################################################
#After initial review some reanalysis was needed; everything between the >>>> markers below was moved to "oldAnalysis"
mkdir sams
srun cp ./highQualityConcat/*.trim ./sams
#Make sure your references are indexed with bowtie2
cd ~/genomes/symbiontGenomes/singleChromo
#Here I made a mega concatenated version of genomes for Symbiodinium, Breviolum, Cladocopium, and Durusdinium. The papers these are from and which chromosomes each genus is associated with are listed.
#Symbiodinium microadriaticum (Aranda et al. 2016), Chromosomes 1-5
#Breviolum minutum (Shoguchi et al. 2013), Chromosomes 6-9
#Cladocopium goreaui (Liu et al. 2018), Chromosomes 10-15
#Durusdinium trenchii (Shoguchi et al. 2021), Chromosomes 16-19
export GENOME_FASTA=~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta
echo 'bowtie2-build $GENOME_FASTA $GENOME_FASTA' > btb
launcher_creator.py -j btb -n btb -t 6:00:00 -e [email protected] -q shortq7
sbatch btb.slurm
srun samtools faidx $GENOME_FASTA
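#Once the build job finishes, the index files should sit next to the fasta (large genomes get .bt2l instead of .bt2):
ls ${GENOME_FASTA}*.bt2*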
# First, map reads to the algal symbiont reference genome with soft-clipping (Bowtie2 --local option) to avoid indels near read ends
GENOME_FASTA=~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta #Index prefix must match what bowtie2-build used above (the un-gzipped .fasta)
cd sams
mkdir zoox
mkdir unmapped
#2bRAD_bowtie2_launch_zoox.pl is an adaptation of the bowtie launcher script: it aligns the trim files to the concatenated algal symbiont genomes and produces fastq files of aligned reads, fastq files of unaligned reads, and sam files from which we can calculate alignment rates to the algal symbiont genomes
./2bRAD_bowtie2_launch_zoox.pl '\.trim$' ~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta > maps1
launcher_creator.py -N 10 -j maps1 -n maps1 -t 6:00:00 -e [email protected] -q shortq7
sbatch maps1.slurm
>alignmentRates
for F in `ls *trim`; do
M=`grep -E '^[ATGCN]+$' $F | wc -l | grep -f - maps1.e* -A 4 | tail -1 | perl -pe 's/maps1\.e\d+-|% overall alignment rate//g'` ; #The job was named maps1, so its error files are maps1.e*
echo "$F.sam $M">>alignmentRates;
done
##############################################################################
cd ~/2bRAD/floridaKeys/sams/zoox
mkdir zooxOnly
mkdir zooxAndMcav
./2bRAD_bowtie2_launch_dual_filter.pl '\.fq$' ~/genomes/Mcav_genome/Mcavernosa_July2018.fasta > mapsZoox2Mcav
launcher_creator.py -N 10 -j mapsZoox2Mcav -n mapsZoox2Mcav -t 6:00:00 -e [email protected] -q shortq7
sbatch mapsZoox2Mcav.slurm
#As a sanity check
cd zooxOnly
2bRAD_bowtie2_launch.pl '\.fq$' ~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta > zoox2Zoox
launcher_creator.py -N 5 -j zoox2Zoox -n zoox2Zoox -t 6:00:00 -e [email protected] -q shortq7
sbatch zoox2Zoox.slurm
#Check the zoox2Zoox.e* file and make sure these have 100% alignment rate
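#A quick way to pull the rates, assuming the launcher writes bowtie2's stderr to zoox2Zoox.e* as for the earlier mapping jobs:
grep "overall alignment rate" zoox2Zoox.e*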
mkdir ../zooxSams
################################################################################
mkdir zooxBamsNoClones
srun cp ./zooxSams/*.sam ./zooxBamsNoClones
#Remove the host clone files
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *.bam > zooxBamsNoClones #Bam list for angsd (matches the -b argument below)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b zooxBamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkZooxNoClones
# how many SNPs?
NSITES=`zcat fkZooxNoClones.mafs.gz | wc -l`
echo $NSITES
#Only 4 sites, not enough to do anything with
#But can count alignments
for i in *.bam; do
echo $i >> output
samtools idxstats $i | cut -f 1,3 >> output
done
################################################################################
cd ~/2bRAD/floridaKeys/sams/unmapped
2bRAD_bowtie2_launch.pl '\.fq$' ~/genomes/Mcav_genome/Mcavernosa_July2018.fasta > mapsUn2Mcav
launcher_creator.py -N 10 -j mapsUn2Mcav -n mapsUn2Mcav -t 6:00:00 -e [email protected] -q shortq7
sbatch mapsUn2Mcav.slurm
mkdir ../mcavSams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 6:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
################################################################################
mkdir mcavANGSDClones
srun mv ./mcavBams/*bam* ./mcavANGSDClones/
ls *.bam > bamsClones
# angsd settings:
# -minMapQ 20 : only highly unique mappings (prob of erroneous mapping = 1%)
# -baq 1 : realign around indels (not terribly relevant for 2bRAD reads mapped with --local option)
# -maxDepth : highest total depth (sum over all samples) to assess; set to 10x number of samples
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -maxDepth 2230"
# T O D O :
TODO="-doQsDist 1 -doDepth 1 -doCounts 1 -dumpCounts 2"
srun angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out ddClones
# summarizing results (using modified script by Matteo Fumagalli)
srun --mem=200GB Rscript ~/bin/plotQC.R ddClones > qranks
# proportion of sites covered at >5x:
cat qranks
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
##ANGSD WITH NEW FILTERS W CLONES
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 168 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavClones
# how many SNPs?
NSITES=`zcat fkMcavClones.mafs.gz | wc -l`
echo $NSITES
#scp the ibs matrix to identify clones
################################################################################
mkdir mcavBamsNoClones
srun cp ./mcavBams/*.bam* ./mcavBamsNoClones
#Manually remove clones and technical replicates based on IBS dendrogram
#YOU SHOULD HAVE 215 FILES
################################################################################
mkdir mcavANGSDNoClones
srun cp ./mcavBamsNoClones/*bam* ./mcavANGSDNoClones
ls *.bam > bamsNoClones #Bam list for angsd (matches the -b argument below)
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoClones
# how many SNPs?
NSITES=`zcat fkMcavNoClones.mafs.gz | wc -l`
echo $NSITES
# NgsAdmix for K from 2 to 11 : do not run if the dataset contains clones or genotyping replicates!
for K in `seq 2 11` ;
do
NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K};
done
#Run evalAdmix on each K (single K=2 example first, then the loop over all K)
srun ~/bin/evalAdmix/evalAdmix -beagle fkMcavNoClones.beagle.gz -fname fkMcavNoClones_k2.fopt.gz -qname fkMcavNoClones_k2.qopt -o fkMcavNoClones_k2_residuals
for K in `seq 2 11` ;
do
srun ~/bin/evalAdmix/evalAdmix -beagle fkMcavNoClones.beagle.gz -fname fkMcavNoClones_k${K}.fopt.gz -qname fkMcavNoClones_k${K}.qopt -o fkMcavNoClones_k${K}_residuals -P 20
done
mkdir ngsAdmixKselect
srun cp *.beagle.gz ./ngsAdmixKselect
cd ngsAdmixKselect
## Next, run NGSadmix for ten iterations at each K; take the likelihood values from each run and put them into a file that can be used with Clumpak to calculate the most likely K using the methods of Evanno et al. (2005).
>ngsAdmix
for R in {1..10}; do
for K in {1..11}; do
echo "NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K}_run$R" >> ngsAdmix;
done;
done
launcher_creator.py -j ngsAdmix -n ngsAdmix -q shortq7 -N 3 -t 06:00:00
sbatch ngsAdmix.slurm
> logfile
for log in *.log; do
grep -Po 'like=\K[^ ]+' $log >> logfile;
done
#format for CLUMPAK in R
R
# you are now using R in the terminal
logs <- as.data.frame(read.table("logfile"))
logs$K <- c(rep("10",10),rep("11",10),rep("1",10),rep("2",10),rep("3",10),rep("4",10),rep("5",10),rep("6", 10),rep("7",10),rep("8",10),rep("9",10))
write.table(logs[, c(2, 1)], "logfile_formatted", row.names = F, col.names = F, quote = F)
quit()
#Save and upload to Clumpak
mkdir structureSelector
srun cp *.qopt ./structureSelector #Copy all your ngsadmix runs to a directory
rename .qopt .Q *.qopt
#Download and zip this directory, then upload it along with a popMap to StructureSelector to run the Puechmaille method
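#A minimal packaging sketch: zip on koko, then pull the archive down from your laptop (adjust the remote path to wherever structureSelector lives):
zip -r structureSelector.zip structureSelector
#scp [email protected]:structureSelector.zip .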
################################################################################
#Calculating population parameters no filter on excess heterozygosity
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
mkdir angsdPopStats
srun cp ./mcavBamsNoClones/*bam* ./angsdPopStats #copy over all the bam files (clones removed) and the bam lists for each pop. Pop definitions are below
FILTERS="-uniqueOnly 1 -remove_bads 1 -skipTriallelic 1 -minMapQ 20 -minQ 25 -doHWE 1 -sb_pval 1e-5 -hetbias_pval 1e-5 -minInd 162" #Note there are no MAF or snp filters so as not to affect allelic frequencies that may mess with heterozygosity calcs
TODO="-doMajorMinor 1 -doMaf 1 -dosnpstat 1 -doPost 2 -doGeno 11 -doGlf 2"
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoSNPFilter
NSITES=`zcat fkMcavNoSNPFilter.mafs.gz | wc -l`
echo $NSITES
srun --mem=100GB Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz
#Prints individual heterozygosity
#Or submit as a batch job with more memory:
echo '#!/bin/bash' > RHet.sh
echo 'Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz' >> RHet.sh
sbatch -e RHet.e%j -o RHet.o%j --mem=200GB --mail-user [email protected] --mail-type=ALL RHet.sh
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
mkdir samsConcat
srun cp ./highQualityConcat/*.trim ./samsConcat
# Mapping reads to a reference genome with soft-clipping (Bowtie2 --local option) to avoid indels near read ends
GENOME_FASTA=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/mcav_syms_updated.fasta
2bRAD_bowtie2_launch.pl '\.trim$' $GENOME_FASTA > maps
launcher_creator.py -j maps -n maps -t 2:00:00 -e [email protected] -q shortq7
sbatch maps.slurm
#See how many reads aligned to the zoox transcriptomes
#Use a ~2 Mb contig as the host reference; look up contig lengths in the @SQ header lines of any sam file
#Run scripts to count read alignments to each of the algal symbiont transcriptomes
zooxType.pl host="Sc0000000" >zooxCounts.txt
#scp zooxCounts.txt to local directory and open as csv file
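#To pick a suitably long host contig, inspect the @SQ header lines (fk1 is an illustrative file name; SN is the contig name, LN its length):
grep '^@SQ' fk1.trim.bt2.sam | head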
################################################################################
mkdir mcavSams
srun cp ./samsConcat/*.sam ./mcavSams
#Re-write sam files without any reads aligning to the algal symbiont transcriptomes
for f in *.sam
do
egrep -v "chr11|chr12|chr13|chr14" < "$f" > "${f%.sam}.mcav.sam" #strip .sam before adding the new suffix to avoid doubled extensions
done
#These sam files should now only have MCAV reads
################################################################################
mkdir zooxSams
srun cp ./samsConcat/*.sam ./zooxSams
#Re-write sam files keeping only reads that aligned to the algal symbiont transcriptomes
for f in *.trim.bt2.sam
do
egrep "chr11|chr12|chr13|chr14" < "$f" > "${f%.sam}.zoox.sam" #strip .sam before adding the new suffix to avoid doubled extensions
done
#These sam files should now only have zoox reads
################################################################################
mkdir mcavBams
srun cp ./mcavSams/*.sam ./mcavBams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *bam >bams
#Archive these BAMS files
#Read counts
for i in *.bam; do
echo $i >> output
samtools view $i | cut -f1 | sort | uniq | wc -l >> output
done
################################################################################
mkdir zooxBams
srun cp ./zooxSams/*.sam ./zooxBams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *bam >bams
#Archive these BAMS files
#Read counts
for i in *.bam; do
echo $i >> output
samtools view $i | cut -f1 | sort | uniq | wc -l >> output
done
################################################################################
mkdir mcavANGSDClones
srun cp ./mcavBams/*bam* ./mcavANGSDClones/
# angsd settings:
# -minMapQ 20 : only highly unique mappings (prob of erroneous mapping = 1%)
# -baq 1 : realign around indels (not terribly relevant for 2bRAD reads mapped with --local option)
# -maxDepth : highest total depth (sum over all samples) to assess; set to 10x number of samples
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -maxDepth 2230"
# T O D O :
TODO="-doQsDist 1 -doDepth 1 -doCounts 1 -dumpCounts 2"
srun angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out ddClones
# summarizing results (using modified script by Matteo Fumagalli)
srun Rscript ~/bin/plotQC.R ddClones > qranks
# proportion of sites covered at >5x:
cat qranks
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
##ANGSD WITH NEW FILTERS W CLONES
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 168 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doVcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out fkMcavClones
# how many SNPs?
NSITES=`zcat fkMcavClones.mafs.gz | wc -l`
echo $NSITES
#scp the ibs matrix to identify clones
################################################################################
mkdir mcavBamsNoClones
srun cp ./mcavBams/*.bam* ./mcavBamsNoClones
#Manually remove clones and technical replicates based on IBS dendrogram
#YOU SHOULD HAVE 215 FILES
################################################################################
mkdir mcavANGSDNoClones
srun cp ./mcavBamsNoClones/*bam* ./mcavANGSDNoClones
ls *.bam > bamsNoClones #Bam list for angsd (matches the -b argument below)
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoClones
# how many SNPs?
NSITES=`zcat fkMcavNoClones.mafs.gz | wc -l`
echo $NSITES
# NgsAdmix for K from 2 to 11 : do not run if the dataset contains clones or genotyping replicates!
for K in `seq 2 11` ;
do
NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K};
done
## Next, take the likelihood values from each run of NGSadmix and put them into a file that can be used with Clumpak to calculate the most likely K using the methods of Evanno et al. (2005).
>ngsAdmix
for R in {1..10}; do
for K in {1..11}; do
echo "NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K}_run$R" >> ngsAdmix;
done;
done
launcher_creator.py -j ngsAdmix -n ngsAdmix -q shortq7 -N 3 -t 06:00:00
sbatch ngsAdmix.slurm
> logfile
for log in *.log; do
grep -Po 'like=\K[^ ]+' $log >> logfile;
done
#format for CLUMPAK in R
R
# you are now using R in the terminal
logs <- as.data.frame(read.table("logfile"))
logs$K <- c(rep("10", 10), rep("11", 10), rep("1", 10), rep("2", 10), rep("3", 10), rep("4", 10), rep("5", 10), rep("6", 10), rep("7", 10), rep("8", 10), rep("9", 10))
write.table(logs[, c(2, 1)], "logfile_formatted", row.names = F, col.names = F, quote = F)
quit()
# alternatively, to use real ADMIXTURE on called SNPs (requires plink and ADMIXTURE):
cat fkMcavNoClones.vcf | sed 's/xpSc//g' >fkMcavNoClones_chr.vcf
cat fkMcavNoClones_chr.vcf | sed 's/xfSc//g' >fkMcavNoClones_chr1.vcf
cat fkMcavNoClones_chr1.vcf | sed 's/Sc//g' >fkMcavNoClones_chr2.vcf
plink --vcf fkMcavNoClones_chr2.vcf --make-bed --allow-extra-chr --out fkMcavNoClones
for K in `seq 1 11`; \
do admixture --cv fkMcavNoClones.bed $K | tee fkMcavNoClones_${K}.out; done
grep -h CV fkMcavNoClones*.out
plink --vcf fkMcavNoClones.bcf --make-bed --allow-extra-chr --out fkMcavNoClones
for K in `seq 1 11`; \
do admixture --cv fkMcavNoClones.bed $K | tee fkMcavNoClones_${K}.out; done
grep -h CV fkMcavNoClones*.out
####
CV error (K=1): 0.48446
CV error (K=2): 0.43832
CV error (K=3): 0.43216
CV error (K=4): 0.43160
CV error (K=5): 0.43222
CV error (K=6): 0.43130
CV error (K=7): 0.43355
CV error (K=8): 0.44174
CV error (K=9): 0.45271
CV error (K=10): 0.46330
CV error (K=11): 0.46838
bcftools reheader fkMcavNoClones.vcf.gz -s sampleListNoClones -o fkMcavNoClonesRenamed.vcf.gz
################################################################################
mkdir bayescan
cd bayescan
srun cp ../mcavANGSDNoClones/fkMcavNoClones.vcf.gz . #Note: do not use the renamed version; pgdspider has a tough time parsing the samples into pops when you do
srun gunzip fkMcavNoClones.vcf.gz
#scp bspops.txt to koko
echo "############
# VCF Parser questions
PARSER_FORMAT=VCF
# Do you want to include a file with population definitions?
VCF_PARSER_POP_QUESTION=true
# Only input following regions (refSeqName:start:end, multiple regions: whitespace separated):
VCF_PARSER_REGION_QUESTION=
# What is the ploidy of the data?
VCF_PARSER_PLOIDY_QUESTION=DIPLOID
# Only output following individuals (ind1, ind2, ind4, ...):
VCF_PARSER_IND_QUESTION=
# Output genotypes as missing if the read depth of a position for the sample is below:
VCF_PARSER_READ_QUESTION=
# Take most likely genotype if "PL" or "GL" is given in the genotype field?
VCF_PARSER_PL_QUESTION=true
# Do you want to exclude loci with only missing data?
VCF_PARSER_EXC_MISSING_LOCI_QUESTION=false
# Select population definition file:
VCF_PARSER_POP_FILE_QUESTION=./bspops.txt
# Only output SNPs with a phred-scaled quality of at least:
VCF_PARSER_QUAL_QUESTION=
# Do you want to include non-polymorphic SNPs?
VCF_PARSER_MONOMORPHIC_QUESTION=false
# Output genotypes as missing if the phred-scale genotype quality is below:
VCF_PARSER_GTQUAL_QUESTION=
# GESTE / BayeScan Writer questions
WRITER_FORMAT=GESTE_BAYE_SCAN
# Specify which data type should be included in the GESTE / BayeScan file (GESTE / BayeScan can only analyze one data type per file):
GESTE_BAYE_SCAN_WRITER_DATA_TYPE_QUESTION=SNP
############" >vcf2bayescan.spid
java -Xmx1024m -Xms512m -jar ~/bin/PGDSpider_2.0.7.1/PGDSpider2-cli.jar -inputfile fkMcavNoClones.vcf -outputfile fkMcav.bayescan -spid vcf2bayescan.spid
# launching bayescan (this might take 12-24 hours)
srun bayescan fkMcav.bayescan -threads=20
removeBayescanOutliers.pl bayescan=fkMcav.baye_fst.txt vcf=fkMcavNoClones.vcf FDR=0.1 mode=extract > fkVcfOutliers.vcf
srun sed 's/^##fileformat=VCFv4.2(angsd version)/##fileformat=VCFv4.2/' fkVcfOutliers.vcf > fk1VcfOutliers.vcf
removeBayescanOutliers.pl bayescan=fkMcav.baye_fst.txt vcf=fkMcavNoClones.vcf FDR=0.1 mode=delete > fkVcfNeutral.vcf
#Download bayescenv (had to run it on local computer)
cd /Users/student/bin/bayescenv-1.1/bin/mac64 #where the command line bayescenv is downloaded
#scp the fkMcav.bayescan
nano envDepth
32.2 16.6 36.2 25.5 17.8 31.4 40.5 23.6 #average depths of populations in order of the fkMcav.bayescan input doc
./bayescenv fkMcav.bayescan -env envDepth
################################################################################
#Calculating population parameters no filter on excess heterozygosity
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
mkdir angsdPopStats
srun cp ./mcavBamsNoClones/*bam* ./angsdPopStats #copy over all the bam files (clones removed) and the bam lists for each pop. Pop definitions are below
FILTERS="-uniqueOnly 1 -remove_bads 1 -skipTriallelic 1 -minMapQ 20 -minQ 25 -doHWE 1 -sb_pval 1e-5 -hetbias_pval 1e-5 -minInd 162 " #Note there are no MAF or snp filters so as not to affect allelic frequencies that may mess with heterozygosity calcs
TODO="-doMajorMinor 1 -doMaf 1 -dosnpstat 1 -doPost 2 -doGeno 8 -doGlf 2"
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoSNPFilter
NSITES=`zcat fkMcavNoSNPFilter.mafs.gz | wc -l`
echo $NSITES
###1,537,436
# individual heterozygosities (proportion of heterozygotes across SNPs that pass filters)
srun Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz
#Prints individual heterozygosity
#pop0=Lower Keys-Meso, 13
#pop1=Lower Keys-shallow, 30
#pop2=TER-North-Meso, 23
#pop3=TER-North-shallow, 27
#pop4=TER-South-Meso, 37
#pop5=TER-South-shallow, 28
#pop6=Upper-Keys-Meso, 25
#pop7=Upper-Keys-Shallow, 32
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
export GENOME_REF=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/Mcavernosa_July2018.fasta
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -baq 1"
TODO="-doSaf 1 -anc $GENOME_REF -ref $GENOME_REF"
srun angsd -b pop0.bams -minInd 10 -GL 1 -P 1 $TODO $FILTERS -out pop0
srun angsd -b pop1.bams -minInd 23 -GL 1 -P 1 $TODO $FILTERS -out pop1
srun angsd -b pop2.bams -minInd 18 -GL 1 -P 1 $TODO $FILTERS -out pop2
srun angsd -b pop3.bams -minInd 21 -GL 1 -P 1 $TODO $FILTERS -out pop3
srun angsd -b pop4.bams -minInd 28 -GL 1 -P 1 $TODO $FILTERS -out pop4
srun angsd -b pop5.bams -minInd 21 -GL 1 -P 1 $TODO $FILTERS -out pop5
srun angsd -b pop6.bams -minInd 19 -GL 1 -P 1 $TODO $FILTERS -out pop6
srun angsd -b pop7.bams -minInd 24 -GL 1 -P 1 $TODO $FILTERS -out pop7
# generating per-population SFS
nano sfs
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
#SBATCH --job-name=sfs
#SBATCH --output=sfs-%j.out
#SBATCH --error=sfs-%j.err
realSFS pop0.saf.idx >pop0.sfs
realSFS pop1.saf.idx >pop1.sfs
realSFS pop2.saf.idx >pop2.sfs
realSFS pop3.saf.idx >pop3.sfs
realSFS pop4.saf.idx >pop4.sfs
realSFS pop5.saf.idx >pop5.sfs
realSFS pop6.saf.idx >pop6.sfs
realSFS pop7.saf.idx >pop7.sfs
sbatch sfs #Submit after saving the script
##Producing thetas
nano theta
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
#SBATCH --job-name=theta
realSFS saf2theta pop0.saf.idx -sfs pop0.sfs -outname pop0
thetaStat do_stat pop0.thetas.idx
realSFS saf2theta pop1.saf.idx -sfs pop1.sfs -outname pop1
thetaStat do_stat pop1.thetas.idx
realSFS saf2theta pop2.saf.idx -sfs pop2.sfs -outname pop2
thetaStat do_stat pop2.thetas.idx
realSFS saf2theta pop3.saf.idx -sfs pop3.sfs -outname pop3
thetaStat do_stat pop3.thetas.idx
realSFS saf2theta pop4.saf.idx -sfs pop4.sfs -outname pop4
thetaStat do_stat pop4.thetas.idx
realSFS saf2theta pop5.saf.idx -sfs pop5.sfs -outname pop5
thetaStat do_stat pop5.thetas.idx
realSFS saf2theta pop6.saf.idx -sfs pop6.sfs -outname pop6
thetaStat do_stat pop6.thetas.idx
realSFS saf2theta pop7.saf.idx -sfs pop7.sfs -outname pop7
thetaStat do_stat pop7.thetas.idx
sbatch theta #Submit after saving the script
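#For reference, the per-population realSFS / saf2theta / thetaStat calls in the theta script can equivalently be written as a loop:
for P in pop0 pop1 pop2 pop3 pop4 pop5 pop6 pop7; do
realSFS $P.saf.idx > $P.sfs
realSFS saf2theta $P.saf.idx -sfs $P.sfs -outname $P
thetaStat do_stat $P.thetas.idx
done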
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>