#Log in to koko: ssh [email protected]
###FK MCAV Reads Processing Protocol
##Downloading Reads from Basespace
#Go to the bin on your local drive
cd ~/bin
brew tap basespace/basespace && brew install bs-cli #Use homebrew to install the basespace command line downloader
bs auth #Should generate a link to authenticate the usage, go to the link in your browser
bs download project --name JA20179 -o ./rawReads #--name is the project name and -o is the output directory; bs writes to a local path, so download locally first
scp -r ./rawReads [email protected]:~/2bRAD/floridaKeys/ #Then copy the raw reads up to koko
# bs download project --name JA20179 -o /Users/student/Documents/Florida\ Keys/rawReads #Or download to a directory in your local home
cd ~/2bRAD/floridaKeys/rawReads #There should be 50 files (25 samples x 2 lanes)
cd ~/2bRAD/floridaKeys #Return to the project directory so the relative paths below resolve
################################################################################
mkdir concatReads #Make a new directory where you can concatenate your reads between the two lanes.
srun cp ./rawReads/*.fastq.gz ./concatReads
cd concatReads
gunzip *.fastq.gz #unzip all of your files
nano concat.sh #See script below, concatenate all of your files across lanes
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
cat 10_S38_L001_R1_001.fastq 10_S38_L002_R1_001.fastq > fk10.fq
cat 11_S39_L001_R1_001.fastq 11_S39_L002_R1_001.fastq > fk11.fq
cat 12_S40_L001_R1_001.fastq 12_S40_L002_R1_001.fastq > fk12.fq
cat 13_S41_L001_R1_001.fastq 13_S41_L002_R1_001.fastq > fk13.fq
cat 14_S42_L001_R1_001.fastq 14_S42_L002_R1_001.fastq > fk14.fq
cat 15_S43_L001_R1_001.fastq 15_S43_L002_R1_001.fastq > fk15.fq
cat 16_S44_L001_R1_001.fastq 16_S44_L002_R1_001.fastq > fk16.fq
cat 17_S45_L001_R1_001.fastq 17_S45_L002_R1_001.fastq > fk17.fq
cat 18_S46_L001_R1_001.fastq 18_S46_L002_R1_001.fastq > fk18.fq
cat 19_S47_L001_R1_001.fastq 19_S47_L002_R1_001.fastq > fk19.fq
cat 1_S29_L001_R1_001.fastq 1_S29_L002_R1_001.fastq > fk1.fq
cat 20_S48_L001_R1_001.fastq 20_S48_L002_R1_001.fastq > fk20.fq
cat 21_S49_L001_R1_001.fastq 21_S49_L002_R1_001.fastq > fk21.fq
cat 22_S50_L001_R1_001.fastq 22_S50_L002_R1_001.fastq > fk22.fq
cat 23_S51_L001_R1_001.fastq 23_S51_L002_R1_001.fastq > fk23.fq
cat 24_S52_L001_R1_001.fastq 24_S52_L002_R1_001.fastq > fk24.fq
cat 25_S53_L001_R1_001.fastq 25_S53_L002_R1_001.fastq > fk25.fq
cat 2_S30_L001_R1_001.fastq 2_S30_L002_R1_001.fastq > fk2.fq
cat 3_S31_L001_R1_001.fastq 3_S31_L002_R1_001.fastq > fk3.fq
cat 4_S32_L001_R1_001.fastq 4_S32_L002_R1_001.fastq > fk4.fq
cat 5_S33_L001_R1_001.fastq 5_S33_L002_R1_001.fastq > fk5.fq
cat 6_S34_L001_R1_001.fastq 6_S34_L002_R1_001.fastq > fk6.fq
cat 7_S35_L001_R1_001.fastq 7_S35_L002_R1_001.fastq > fk7.fq
cat 8_S36_L001_R1_001.fastq 8_S36_L002_R1_001.fastq > fk8.fq
cat 9_S37_L001_R1_001.fastq 9_S37_L002_R1_001.fastq > fk9.fq
sbatch concat.sh #Submit the concatenation job after saving the script
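#The 25 cat lines above can equivalently be written as a loop; a minimal sketch, assuming every file follows the <sample>_<S##>_<lane>_R1_001.fastq pattern shown above:
for f in *_L001_R1_001.fastq; do
s=${f%%_*} #sample number is everything before the first underscore
cat $f ${f/L001/L002} > fk${s}.fq
done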
################################################################################
mkdir trimmedReads
srun cp ./concatReads/*.fq ./trimmedReads
#Set up a script to trim and deduplicate the files
2bRAD_trim_launch_dedup.pl fq > trims #Writes one trim/dedup command per .fq file into trims
launcher_creator.py -j trims -n trims -t 2:00:00 -e [email protected] -q shortq7
sbatch trims.slurm
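#Sanity check once the job finishes; the trim/dedup step writes .tr0 files (see the rename step below):
ls *.tr0 | wc -l #Should match the number of input .fq files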
################################################################################
mkdir renamedReads
srun cp ./trimmedReads/*.tr0 ./renamedReads
#Create and scp a csv file with one column of current file names (as output by deduplication, with the in-line barcodes) and one column of the sample IDs you want to rename the files to. Upload the csv to the working directory along with the sampleRename.py script
python sampleRename.py -i sampleRename -f tr0
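#A hypothetical sampleRename csv, assuming the two-column current-name,new-name layout described above (names are illustrative only):
#fk1_AGAC.tr0,FK001
#fk2_AGAC.tr0,FK002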
################################################################################
mkdir highQualityReads
srun cp ./renamedReads/*.tr0 ./highQualityReads
# Quality filtering using fastx_toolkit
# Creating a list of filtering commands:
# The options -q 20 -p 90 mean that 90% or more of all bases within the read should have PHRED quality of at least 20 (i.e., probability of error 1% or less)
# PHRED quality=10*(-log10(P.error))
ls *.tr0 | perl -pe 's/^(\S+)\.tr0$/cat $1\.tr0 \| fastq_quality_filter -q 20 -p 90 >$1\.trim/' >filt0
# NOTE: run the next line ONLY if your qualities are 33-based (GSAF results are 33-based):
cat filt0 | perl -pe 's/filter /filter -Q33 /' > filt
#if you did NOT run the line above, run: mv filt0 filt
launcher_creator.py -j filt -n filt -t 1:00:00 -e [email protected] -q shortq7
sbatch filt.slurm
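#For reference, after the -Q33 edit each line of filt should read like this (illustrative sample name):
#cat fk1.tr0 | fastq_quality_filter -Q33 -q 20 -p 90 >fk1.trim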
################################################################################
mkdir mappedReadsNoConcat
srun cp ./highQualityReads/*.trim ./mappedReadsNoConcat
mkdir referenceGenome #Ensure that the concatenated MCAV and algal symbiont transcriptomes are in this directory along with the associated genome index files
cd mappedReadsNoConcat
GENOME_FASTA=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/mcav_syms_updated.fasta
# mapping with --local option, enables clipping of mismatching ends (guards against deletions near ends of RAD tags)
2bRAD_bowtie2_launch.pl '\.trim$' $GENOME_FASTA > maps #Writes one bowtie2 command per .trim file into maps
launcher_creator.py -j maps -n maps -t 2:00:00 -e [email protected] -q shortq7 #Execute all commands in maps
sbatch maps.slurm
#Check that the number of .sam files produced matches the number of .trim input files
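#Each line of maps is expected to be a local-mode bowtie2 call, roughly of this form (exact flags come from the launcher script; fk1 is illustrative):
#bowtie2 --local -x $GENOME_FASTA -U fk1.trim -S fk1.trim.bt2.sam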
################################################################################
mkdir bamsNoConcat
srun cp ./mappedReadsNoConcat/*.sam ./bamsNoConcat
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
#Execute all commands written to s2b
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *.bam > bams #Bam list for angsd (matches the -b argument below)
#Ensure that the number of output bams files matches the number of original sams files
###Now run ANGSD to produce an IBS matrix, build an IBS dendrogram from it, and check that all re-extracts, technical replicates, etc. cluster as clones of one another
export GENOME_REF=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/Mcavernosa_July2018.fasta
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -baq 1 -ref $GENOME_REF -maxDepth 2570"
TODO="-doQsDist 1 -doDepth 1 -doCounts 1"
angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out dd
Rscript ~/bin/plotQC.R dd
cat dd.info
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 30 -minInd 205 -snp_pval 1e-5 -minMaf 0.05 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 32 -doPost 1 -doGlf 2"
angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out noSamplesConcat
NSITES=`zcat noSamplesConcat.mafs.gz | wc -l`
echo $NSITES
## 13,819 SNPs
#scp the ibsMat and use it to generate a cluster dendrogram so you can identify repeated samples, clones, etc.
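#A minimal dendrogram sketch (run wherever R is available), assuming noSamplesConcat.ibsMat and the bams list sit in the working directory:
Rscript -e 'm <- as.matrix(read.table("noSamplesConcat.ibsMat")); b <- scan("bams", what="character"); dimnames(m) <- list(b, b); pdf("ibsDendro.pdf"); plot(hclust(as.dist(m), "ave"), cex=0.5); dev.off()'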
################################################################################
mkdir tr0Concat
srun cp ./renamedReads/*.tr0 ./tr0Concat
###Concatenate any re-extracts or bad-barcode files that the dendrogram shows are clones of good-barcode samples.
##For now leave technical replicates and natural clones
#YOU SHOULD HAVE 223 FILES
################################################################################
mkdir highQualityConcat
srun cp ./tr0Concat/*.tr0 ./highQualityConcat
# Quality filtering using fastx_toolkit
# The options -q 20 -p 90 mean that 90% or more of all bases within the read should have PHRED quality of at least 20 (i.e., probability of error 1% or less)
# PHRED quality=10*(-log10(P.error))
ls *.tr0 | perl -pe 's/^(\S+)\.tr0$/cat $1\.tr0 \| fastq_quality_filter -q 20 -p 90 >$1\.trim/' >filt0
# NOTE: run the next line ONLY if your qualities are 33-based (GSAF results are 33-based):
cat filt0 | perl -pe 's/filter /filter -Q33 /' > filt
#if you did NOT run the line above, run this one:
mv filt0 filt
launcher_creator.py -j filt -n filt -t 1:00:00 -e [email protected] -q shortq7
sbatch filt.slurm
################################################################################
#After initial review some reanalysis was needed; everything between the >>>> markers below was moved to "oldAnalysis"
mkdir sams
srun cp ./highQualityConcat/*.trim ./sams
#Make sure your references are indexed with bowtie2
cd ~/genomes/symbiontGenomes/singleChromo
#Here I made a mega concatenated version of genomes for Symbiodinium, Breviolum, Cladocopium, and Durusdinium. The papers these are from and which chromosomes each genus is associated with are listed.
#Symbiodinium microadriaticum (Aranda et al. 2016), Chromosomes 1-5
#Breviolum minutum (Shoguchi et al. 2013), Chromosomes 6-9
#Cladocopium goreaui (Liu et al. 2018), Chromosomes 10-15
#Durusdinium trenchii (Shoguchi et al. 2021), Chromosomes 16-19
export GENOME_FASTA=~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta
echo 'bowtie2-build $GENOME_FASTA $GENOME_FASTA' > btb
launcher_creator.py -j btb -n btb -t 6:00:00 -e [email protected] -q shortq7
sbatch btb.slurm
srun samtools faidx $GENOME_FASTA
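#Once the build job finishes, the index files should sit next to the fasta (large genomes get .bt2l instead of .bt2):
ls ${GENOME_FASTA}*.bt2*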
# First, map reads to the algal symbiont reference genome with soft-clipping (Bowtie2 --local option) to avoid indels near read ends
GENOME_FASTA=~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta #Index prefix must match what bowtie2-build used above (the un-gzipped .fasta)
cd sams
mkdir zoox
mkdir unmapped
#2bRAD_bowtie2_launch_zoox.pl is an adaptation of the bowtie launcher script: it aligns the trim files to the concatenated algal symbiont genomes and produces fastq files of aligned reads, fastq files of unaligned reads, and sam files from which we can calculate alignment rates to the algal symbiont genomes
./2bRAD_bowtie2_launch_zoox.pl '\.trim$' ~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta > maps1
launcher_creator.py -N 10 -j maps1 -n maps1 -t 6:00:00 -e [email protected] -q shortq7
sbatch maps1.slurm
>alignmentRates
for F in `ls *trim`; do
M=`grep -E '^[ATGCN]+$' $F | wc -l | grep -f - maps1.e* -A 4 | tail -1 | perl -pe 's/maps1\.e\d+-|% overall alignment rate//g'` ; #The job was named maps1, so its error files are maps1.e*
echo "$F.sam $M">>alignmentRates;
done
##############################################################################
cd ~/2bRAD/floridaKeys/sams/zoox
mkdir zooxOnly
mkdir zooxAndMcav
./2bRAD_bowtie2_launch_dual_filter.pl '\.fq$' ~/genomes/Mcav_genome/Mcavernosa_July2018.fasta > mapsZoox2Mcav
launcher_creator.py -N 10 -j mapsZoox2Mcav -n mapsZoox2Mcav -t 6:00:00 -e [email protected] -q shortq7
sbatch mapsZoox2Mcav.slurm
#As a sanity check
cd zooxOnly
2bRAD_bowtie2_launch.pl '\.fq$' ~/genomes/symbiontGenomes/singleChromo/concatZooxGenomes.fasta > zoox2Zoox
launcher_creator.py -N 5 -j zoox2Zoox -n zoox2Zoox -t 6:00:00 -e [email protected] -q shortq7
sbatch zoox2Zoox.slurm
#Check the zoox2Zoox.e* file and make sure these have 100% alignment rate
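#A quick way to pull the rates, assuming the launcher writes bowtie2's stderr to zoox2Zoox.e* as for the earlier mapping jobs:
grep "overall alignment rate" zoox2Zoox.e*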
mkdir ../zooxSams
################################################################################
mkdir zooxBamsNoClones
srun cp ./zooxSams/*.sam ./zooxBamsNoClones
#Remove the host clone files
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *.bam > zooxBamsNoClones #Bam list for angsd (matches the -b argument below)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b zooxBamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkZooxNoClones
# how many SNPs?
NSITES=`zcat fkZooxNoClones.mafs.gz | wc -l`
echo $NSITES
#Only 4 sites, not enough to do anything with
#But can count alignments
for i in *.bam; do
echo $i >> output
samtools idxstats $i | cut -f 1,3 >> output
done
################################################################################
cd ~/2bRAD/floridaKeys/sams/unmapped
2bRAD_bowtie2_launch.pl '\.fq$' ~/genomes/Mcav_genome/Mcavernosa_July2018.fasta > mapsUn2Mcav
launcher_creator.py -N 10 -j mapsUn2Mcav -n mapsUn2Mcav -t 6:00:00 -e [email protected] -q shortq7
sbatch mapsUn2Mcav.slurm
mkdir ../mcavSams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 6:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
################################################################################
mkdir mcavANGSDClones
srun mv ./mcavBams/*bam* ./mcavANGSDClones/
ls *.bam > bamsClones
# angsd settings:
# -minMapQ 20 : only highly unique mappings (prob of erroneous mapping = 1%)
# -baq 1 : realign around indels (not terribly relevant for 2bRAD reads mapped with --local option)
# -maxDepth : highest total depth (sum over all samples) to assess; set to 10x number of samples
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -maxDepth 2230"
# T O D O :
TODO="-doQsDist 1 -doDepth 1 -doCounts 1 -dumpCounts 2"
srun angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out ddClones
# summarizing results (using modified script by Matteo Fumagalli)
srun --mem=200GB Rscript ~/bin/plotQC.R ddClones > qranks
# proportion of sites covered at >5x:
cat qranks
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
##ANGSD WITH NEW FILTERS W CLONES
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 168 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavClones
# how many SNPs?
NSITES=`zcat fkMcavClones.mafs.gz | wc -l`
echo $NSITES
#scp the ibs matrix to identify clones
################################################################################
mkdir mcavBamsNoClones
srun cp ./mcavBams/*.bam* ./mcavBamsNoClones
#Manually remove clones and technical replicates based on IBS dendrogram
#YOU SHOULD HAVE 215 FILES
################################################################################
mkdir mcavANGSDNoClones
srun cp ./mcavBamsNoClones/*bam* ./mcavANGSDNoClones
ls *.bam > bamsNoClones #Bam list for angsd (matches the -b argument below)
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoClones
# how many SNPs?
NSITES=`zcat fkMcavNoClones.mafs.gz | wc -l`
echo $NSITES
# NgsAdmix for K from 2 to 11 : do not run if the dataset contains clones or genotyping replicates!
for K in `seq 2 11` ;
do
NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K};
done
#Run evalAdmix on each K (single K=2 example first, then the loop over all K)
srun ~/bin/evalAdmix/evalAdmix -beagle fkMcavNoClones.beagle.gz -fname fkMcavNoClones_k2.fopt.gz -qname fkMcavNoClones_k2.qopt -o fkMcavNoClones_k2_residuals
for K in `seq 2 11` ;
do
srun ~/bin/evalAdmix/evalAdmix -beagle fkMcavNoClones.beagle.gz -fname fkMcavNoClones_k${K}.fopt.gz -qname fkMcavNoClones_k${K}.qopt -o fkMcavNoClones_k${K}_residuals -P 20
done
mkdir ngsAdmixKselect
srun cp *.beagle.gz ./ngsAdmixKselect
cd ngsAdmixKselect
## Next, run NGSadmix for ten iterations at each K; take the likelihood values from each run and put them into a file that can be used with Clumpak to calculate the most likely K using the methods of Evanno et al. (2005).
>ngsAdmix
for R in {1..10}; do
for K in {1..11}; do
echo "NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K}_run$R" >> ngsAdmix;
done;
done
launcher_creator.py -j ngsAdmix -n ngsAdmix -q shortq7 -N 3 -t 06:00:00
sbatch ngsAdmix.slurm
> logfile
for log in *.log; do
grep -Po 'like=\K[^ ]+' $log >> logfile;
done
#format for CLUMPAK in R
R
# you are now using R in the terminal
logs <- as.data.frame(read.table("logfile"))
logs$K <- c(rep("10",10),rep("11",10),rep("1",10),rep("2",10),rep("3",10),rep("4",10),rep("5",10),rep("6", 10),rep("7",10),rep("8",10),rep("9",10))
write.table(logs[, c(2, 1)], "logfile_formatted", row.names = F, col.names = F, quote = F)
quit()
#Save and upload to Clumpak
mkdir structureSelector
srun cp *.qopt ./structureSelector #Copy all your ngsadmix runs to a directory
rename .qopt .Q *.qopt
#Download and zip this directory, then upload it along with a popMap to StructureSelector to run the Puechmaille method
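#A minimal packaging sketch: zip on koko, then pull the archive down from your laptop (adjust the remote path to wherever structureSelector lives):
zip -r structureSelector.zip structureSelector
#scp [email protected]:structureSelector.zip .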
################################################################################
#Calculating population parameters no filter on excess heterozygosity
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
mkdir angsdPopStats
srun cp ./mcavBamsNoClones/*bam* ./angsdPopStats #copy over all the bam files (clones removed) and the bam lists for each pop. Pop definitions are below
FILTERS="-uniqueOnly 1 -remove_bads 1 -skipTriallelic 1 -minMapQ 20 -minQ 25 -doHWE 1 -sb_pval 1e-5 -hetbias_pval 1e-5 -minInd 162" #Note there are no MAF or snp filters so as not to affect allelic frequencies that may mess with heterozygosity calcs
TODO="-doMajorMinor 1 -doMaf 1 -dosnpstat 1 -doPost 2 -doGeno 11 -doGlf 2"
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoSNPFilter
NSITES=`zcat fkMcavNoSNPFilter.mafs.gz | wc -l`
echo $NSITES
srun --mem=100GB Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz
#Prints individual heterozygosity
#Or submit as a batch job with more memory:
echo '#!/bin/bash' > RHet.sh
echo 'Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz' >> RHet.sh
sbatch -e RHet.e%j -o RHet.o%j --mem=200GB --mail-user [email protected] --mail-type=ALL RHet.sh
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
mkdir samsConcat
srun cp ./highQualityConcat/*.trim ./samsConcat
# Mapping reads to a reference genome with soft-clipping (Bowtie2 --local option) to avoid indels near read ends
GENOME_FASTA=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/mcav_syms_updated.fasta
2bRAD_bowtie2_launch.pl '\.trim$' $GENOME_FASTA > maps
launcher_creator.py -j maps -n maps -t 2:00:00 -e [email protected] -q shortq7
sbatch maps.slurm
#See how many reads aligned to the zoox transcriptomes
#Use a ~2 Mb contig as the host reference; look up contig lengths in the @SQ header lines of any sam file
#Run scripts to count read alignments to each of the algal symbiont transcriptomes
zooxType.pl host="Sc0000000" >zooxCounts.txt
#scp zooxCounts.txt to local directory and open as csv file
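#To pick a suitably long host contig, inspect the @SQ header lines (fk1 is an illustrative file name; SN is the contig name, LN its length):
grep '^@SQ' fk1.trim.bt2.sam | head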
################################################################################
mkdir mcavSams
srun cp ./samsConcat/*.sam ./mcavSams
#Re-write sam files without any reads aligning to the algal symbiont transcriptomes
for f in *.sam
do
egrep -v "chr11|chr12|chr13|chr14" < "$f" > "${f%.sam}.mcav.sam" #strip .sam before adding the new suffix to avoid doubled extensions
done
#These sam files should now only have MCAV reads
################################################################################
mkdir zooxSams
srun cp ./samsConcat/*.sam ./zooxSams
#Re-write sam files keeping only reads that aligned to the algal symbiont transcriptomes
for f in *.trim.bt2.sam
do
egrep "chr11|chr12|chr13|chr14" < "$f" > "${f%.sam}.zoox.sam" #strip .sam before adding the new suffix to avoid doubled extensions
done
#These sam files should now only have zoox reads
################################################################################
mkdir mcavBams
srun cp ./mcavSams/*.sam ./mcavBams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *bam >bams
#Archive these BAMS files
#Read counts
for i in *.bam; do
echo $i >> output
samtools view $i | cut -f1 | sort | uniq | wc -l >> output
done
################################################################################
mkdir zooxBams
srun cp ./zooxSams/*.sam ./zooxBams
#Compressing, sorting and indexing the SAM files, so they become BAM files:
>s2b
for file in *.sam; do
echo "samtools sort -O bam -o ${file/.sam/}.bam $file && samtools index ${file/.sam/}.bam">>s2b;
done
launcher_creator.py -j s2b -n s2b -t 1:00:00 -N 5 -e [email protected] -q shortq7
sbatch s2b.slurm
ls *bam >bams
#Archive these BAMS files
#Read counts
for i in *.bam; do
echo $i >> output
samtools view $i | cut -f1 | sort | uniq | wc -l >> output
done
################################################################################
mkdir mcavANGSDClones
srun cp ./mcavBams/*bam* ./mcavANGSDClones/
# angsd settings:
# -minMapQ 20 : only highly unique mappings (prob of erroneous mapping = 1%)
# -baq 1 : realign around indels (not terribly relevant for 2bRAD reads mapped with --local option)
# -maxDepth : highest total depth (sum over all samples) to assess; set to 10x number of samples
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -maxDepth 2230"
# T O D O :
TODO="-doQsDist 1 -doDepth 1 -doCounts 1 -dumpCounts 2"
srun angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out ddClones
# summarizing results (using modified script by Matteo Fumagalli)
srun Rscript ~/bin/plotQC.R ddClones > qranks
# proportion of sites covered at >5x:
cat qranks
# scp dd.pdf to laptop to look at distribution of base quality scores, fraction of sites in each sample passing coverage thresholds, and fraction of sites passing genotyping rates cutoffs. Use these to guide choices of -minQ, -minIndDepth and -minInd filters in subsequent ANGSD runs
##ANGSD WITH NEW FILTERS W CLONES
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 168 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doVcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bams -GL 1 $FILTERS $TODO -P 1 -out fkMcavClones
# how many SNPs?
NSITES=`zcat fkMcavClones.mafs.gz | wc -l`
echo $NSITES
#scp the ibs matrix to identify clones
################################################################################
mkdir mcavBamsNoClones
srun cp ./mcavBams/*.bam* ./mcavBamsNoClones
#Manually remove clones and technical replicates based on IBS dendrogram
#YOU SHOULD HAVE 215 FILES
################################################################################
mkdir mcavANGSDNoClones
srun cp ./mcavBamsNoClones/*bam* ./mcavANGSDNoClones
ls *.bam > bamsNoClones #Bam list for angsd (matches the -b argument below)
# Note: PCA and Admixture are not supposed to be run on data that contain clones or genotyping replicates. For PCA, these can be removed from the IBS distance matrix without rerunning ANGSD; but for ngsAdmix ANGSD must be rerun.
# Generating genotype likelihoods from highly confident (non-sequencing-error) SNPs
# set minInd to 75-80% of your total number of bams
# if you expect very highly differentiated populations with nearly fixed alternative alleles, remove '-hwe_pval 1e-5' from FILTERS
# -doGeno 8 : genotype likelihood format setting for ngsLD; if you want to run PCA, use -doGeno 32 (but I recommend using ibsMat for all ordination work)
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -dosnpstat 1 -doHWE 1 -hwe_pval 1e-5 -sb_pval 1e-5 -hetbias_pval 1e-5 -skipTriallelic 1 -minInd 162 -snp_pval 1e-5 -minMaf 0.05"
TODO="-doMajorMinor 1 -doMaf 1 -doCounts 1 -makeMatrix 1 -doIBS 1 -doCov 1 -doGeno 8 -doBcf 1 -doPost 1 -doGlf 2"
# Starting angsd with -P the number of parallel processes. Funny but in many cases angsd runs faster on -P 1
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoClones
# how many SNPs?
NSITES=`zcat fkMcavNoClones.mafs.gz | wc -l`
echo $NSITES
# NgsAdmix for K from 2 to 11 : do not run if the dataset contains clones or genotyping replicates!
for K in `seq 2 11` ;
do
NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K};
done
## Next, take the likelihood values from each run of NGSadmix and put them into a file that can be used with Clumpak to calculate the most likely K using the methods of Evanno et al. (2005).
>ngsAdmix
for R in {1..10}; do
for K in {1..11}; do
echo "NGSadmix -likes fkMcavNoClones.beagle.gz -K $K -P 10 -o fkMcavNoClones_k${K}_run$R" >> ngsAdmix;
done;
done
launcher_creator.py -j ngsAdmix -n ngsAdmix -q shortq7 -N 3 -t 06:00:00
sbatch ngsAdmix.slurm
> logfile
for log in *.log; do
grep -Po 'like=\K[^ ]+' $log >> logfile;
done
#format for CLUMPAK in R
R
# you are now using R in the terminal
logs <- as.data.frame(read.table("logfile"))
logs$K <- c(rep("10", 10), rep("11", 10), rep("1", 10), rep("2", 10), rep("3", 10), rep("4", 10), rep("5", 10), rep("6", 10), rep("7", 10), rep("8", 10), rep("9", 10))
write.table(logs[, c(2, 1)], "logfile_formatted", row.names = F, col.names = F, quote = F)
quit()
# alternatively, to use real ADMIXTURE on called SNPs (requires plink and ADMIXTURE):
cat fkMcavNoClones.vcf | sed 's/xpSc//g' >fkMcavNoClones_chr.vcf
cat fkMcavNoClones_chr.vcf | sed 's/xfSc//g' >fkMcavNoClones_chr1.vcf
cat fkMcavNoClones_chr1.vcf | sed 's/Sc//g' >fkMcavNoClones_chr2.vcf
plink --vcf fkMcavNoClones_chr2.vcf --make-bed --allow-extra-chr --out fkMcavNoClones
for K in `seq 1 11`; \
do admixture --cv fkMcavNoClones.bed $K | tee fkMcavNoClones_${K}.out; done
grep -h CV fkMcavNoClones*.out
plink --vcf fkMcavNoClones.bcf --make-bed --allow-extra-chr --out fkMcavNoClones
for K in `seq 1 11`; \
do admixture --cv fkMcavNoClones.bed $K | tee fkMcavNoClones_${K}.out; done
grep -h CV fkMcavNoClones*.out
####
CV error (K=1): 0.48446
CV error (K=2): 0.43832
CV error (K=3): 0.43216
CV error (K=4): 0.43160
CV error (K=5): 0.43222
CV error (K=6): 0.43130
CV error (K=7): 0.43355
CV error (K=8): 0.44174
CV error (K=9): 0.45271
CV error (K=10): 0.46330
CV error (K=11): 0.46838
bcftools reheader fkMcavNoClones.vcf.gz -s sampleListNoClones -o fkMcavNoClonesRenamed.vcf.gz
################################################################################
mkdir bayescan
cd bayescan
srun cp ../mcavANGSDNoClones/fkMcavNoClones.vcf.gz . #Note: do not use the renamed version; pgdspider has a tough time parsing the samples into pops when you do
srun gunzip fkMcavNoClones.vcf.gz
#scp bspops.txt to koko
echo "############
# VCF Parser questions
PARSER_FORMAT=VCF
# Do you want to include a file with population definitions?
VCF_PARSER_POP_QUESTION=true
# Only input following regions (refSeqName:start:end, multiple regions: whitespace separated):
VCF_PARSER_REGION_QUESTION=
# What is the ploidy of the data?
VCF_PARSER_PLOIDY_QUESTION=DIPLOID
# Only output following individuals (ind1, ind2, ind4, ...):
VCF_PARSER_IND_QUESTION=
# Output genotypes as missing if the read depth of a position for the sample is below:
VCF_PARSER_READ_QUESTION=
# Take most likely genotype if "PL" or "GL" is given in the genotype field?
VCF_PARSER_PL_QUESTION=true
# Do you want to exclude loci with only missing data?
VCF_PARSER_EXC_MISSING_LOCI_QUESTION=false
# Select population definition file:
VCF_PARSER_POP_FILE_QUESTION=./bspops.txt
# Only output SNPs with a phred-scaled quality of at least:
VCF_PARSER_QUAL_QUESTION=
# Do you want to include non-polymorphic SNPs?
VCF_PARSER_MONOMORPHIC_QUESTION=false
# Output genotypes as missing if the phred-scale genotype quality is below:
VCF_PARSER_GTQUAL_QUESTION=
# GESTE / BayeScan Writer questions
WRITER_FORMAT=GESTE_BAYE_SCAN
# Specify which data type should be included in the GESTE / BayeScan file (GESTE / BayeScan can only analyze one data type per file):
GESTE_BAYE_SCAN_WRITER_DATA_TYPE_QUESTION=SNP
############" >vcf2bayescan.spid
java -Xmx1024m -Xms512m -jar ~/bin/PGDSpider_2.0.7.1/PGDSpider2-cli.jar -inputfile fkMcavNoClones.vcf -outputfile fkMcav.bayescan -spid vcf2bayescan.spid
# launching bayescan (this might take 12-24 hours)
srun bayescan fkMcav.bayescan -threads=20
removeBayescanOutliers.pl bayescan=fkMcav.baye_fst.txt vcf=fkMcavNoClones.vcf FDR=0.1 mode=extract > fkVcfOutliers.vcf
srun sed 's/^##fileformat=VCFv4.2(angsd version)/##fileformat=VCFv4.2/' fkVcfOutliers.vcf > fk1VcfOutliers.vcf
removeBayescanOutliers.pl bayescan=fkMcav.baye_fst.txt vcf=fkMcavNoClones.vcf FDR=0.1 mode=delete > fkVcfNeutral.vcf
#Download bayescenv (had to run it on local computer)
cd /Users/student/bin/bayescenv-1.1/bin/mac64 #where the command line bayescenv is downloaded
#scp the fkMcav.bayescan
nano envDepth
32.2 16.6 36.2 25.5 17.8 31.4 40.5 23.6 #average depths of populations in order of the fkMcav.bayescan input doc
./bayescenv fkMcav.bayescan -env envDepth
################################################################################
#Calculating population parameters no filter on excess heterozygosity
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
mkdir angsdPopStats
srun cp ./mcavBamsNoClones/*bam* ./angsdPopStats #copy over all the bam files (clones removed) and the bam lists for each pop. Pop definitions are below
FILTERS="-uniqueOnly 1 -remove_bads 1 -skipTriallelic 1 -minMapQ 20 -minQ 25 -doHWE 1 -sb_pval 1e-5 -hetbias_pval 1e-5 -minInd 162 " #Note there are no MAF or snp filters so as not to affect allelic frequencies that may mess with heterozygosity calcs
TODO="-doMajorMinor 1 -doMaf 1 -dosnpstat 1 -doPost 2 -doGeno 8 -doGlf 2"
srun angsd -b bamsNoClones -GL 1 $FILTERS $TODO -P 1 -out fkMcavNoSNPFilter
NSITES=`zcat fkMcavNoSNPFilter.mafs.gz | wc -l`
echo $NSITES
###1,537,436
# individual heterozygosities (proportion of heterozygotes across SNPs that pass filters)
srun Rscript ~/bin/heterozygosity_beagle.R fkMcavNoSNPFilter.beagle.gz
#Prints individual heterozygosity
#pop0=Lower Keys-Meso, 13
#pop1=Lower Keys-shallow, 30
#pop2=TER-North-Meso, 23
#pop3=TER-North-shallow, 27
#pop4=TER-South-Meso, 37
#pop5=TER-South-shallow, 28
#pop6=Upper-Keys-Meso, 25
#pop7=Upper-Keys-Shallow, 32
# estimating site frequency likelihoods for each population, also saving allele frequencies (for genome scan)
export GENOME_REF=/home/asturm2017/2bRAD/floridaKeys/referenceGenome/Mcavernosa_July2018.fasta
FILTERS="-uniqueOnly 1 -remove_bads 1 -minMapQ 20 -minQ 25 -baq 1"
TODO="-doSaf 1 -anc $GENOME_REF -ref $GENOME_REF"
srun angsd -b pop0.bams -minInd 10 -GL 1 -P 1 $TODO $FILTERS -out pop0
srun angsd -b pop1.bams -minInd 23 -GL 1 -P 1 $TODO $FILTERS -out pop1
srun angsd -b pop2.bams -minInd 18 -GL 1 -P 1 $TODO $FILTERS -out pop2
srun angsd -b pop3.bams -minInd 21 -GL 1 -P 1 $TODO $FILTERS -out pop3
srun angsd -b pop4.bams -minInd 28 -GL 1 -P 1 $TODO $FILTERS -out pop4
srun angsd -b pop5.bams -minInd 21 -GL 1 -P 1 $TODO $FILTERS -out pop5
srun angsd -b pop6.bams -minInd 19 -GL 1 -P 1 $TODO $FILTERS -out pop6
srun angsd -b pop7.bams -minInd 24 -GL 1 -P 1 $TODO $FILTERS -out pop7
# generating per-population SFS
nano sfs
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
#SBATCH --job-name=sfs
#SBATCH --output=sfs-%j.out
#SBATCH --error=sfs-%j.err
realSFS pop0.saf.idx >pop0.sfs
realSFS pop1.saf.idx >pop1.sfs
realSFS pop2.saf.idx >pop2.sfs
realSFS pop3.saf.idx >pop3.sfs
realSFS pop4.saf.idx >pop4.sfs
realSFS pop5.saf.idx >pop5.sfs
realSFS pop6.saf.idx >pop6.sfs
realSFS pop7.saf.idx >pop7.sfs
sbatch sfs #Submit after saving the script
##Producing thetas
nano theta
#!/bin/sh
#SBATCH --partition shortq7
#SBATCH --nodes 1
#SBATCH --exclusive
#SBATCH --mail-type=all
#SBATCH [email protected]
#SBATCH --job-name=theta
realSFS saf2theta pop0.saf.idx -sfs pop0.sfs -outname pop0
thetaStat do_stat pop0.thetas.idx
realSFS saf2theta pop1.saf.idx -sfs pop1.sfs -outname pop1
thetaStat do_stat pop1.thetas.idx
realSFS saf2theta pop2.saf.idx -sfs pop2.sfs -outname pop2
thetaStat do_stat pop2.thetas.idx
realSFS saf2theta pop3.saf.idx -sfs pop3.sfs -outname pop3
thetaStat do_stat pop3.thetas.idx
realSFS saf2theta pop4.saf.idx -sfs pop4.sfs -outname pop4
thetaStat do_stat pop4.thetas.idx
realSFS saf2theta pop5.saf.idx -sfs pop5.sfs -outname pop5
thetaStat do_stat pop5.thetas.idx
realSFS saf2theta pop6.saf.idx -sfs pop6.sfs -outname pop6
thetaStat do_stat pop6.thetas.idx
realSFS saf2theta pop7.saf.idx -sfs pop7.sfs -outname pop7
thetaStat do_stat pop7.thetas.idx
sbatch theta #Submit after saving the script
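#For reference, the per-population realSFS / saf2theta / thetaStat calls in the theta script can equivalently be written as a loop:
for P in pop0 pop1 pop2 pop3 pop4 pop5 pop6 pop7; do
realSFS $P.saf.idx > $P.sfs
realSFS saf2theta $P.saf.idx -sfs $P.sfs -outname $P
thetaStat do_stat $P.thetas.idx
done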
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>