plink_files/plink_for_main_GBS

# 8 October 2019
# The purpose of this code is to use plink for creation of a PCA plot for the main GBS project and to make the SNP able

# WD: /home/jkimball/haasx092/main_GBS/190910_samtools

plink --vcf SeventeenLargestMerged.vcf.gz --double-id --maf 0.05 --allow-extra-chr --recode --out myplink

# PCA calculation
plink --pca --file myplink --allow-extra-chr -out myplink_pca


module load R/3.6.0
# Switch to R

# Load required package.
library(data.table)

# Read in eigenvectors to plot PCA
x <- fread("myplink_pca.eigenvec")

# Remove column 2 (redundant)
x[, V2 := NULL]

# Load in sample key
y <- fread("~/main_GBS/191008_main_GBS_sample_key.csv")

# set column names
setnames(x, c("sample", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13",
			"PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20"))
				
setnames(y, c("ID", "sample", "class", "col"))

# Shorten sample name (change from directory path format) so that it can be merged with data table y
x[, sample := sub("/.+$", "", sample)]

# Remove a sample from data table y that is missing from x
y[sample != "Sample_0199"] -> y

# Read in eigenvalues (to determine % variation explained by each PC)
v <- fread("myplink_pca.eigenval")


# Calculate percent variation (note: I didn't bother renaming the columns to something informative since there is only one)
percentVar = c(PC1=v[1, V1] / sum(v$V1), PC2=v[2, V1] / sum(v$V1))


# Merge data tables
x[y, on="sample"] -> z

# Make the plot
pdf("191008_main_GBS_pca_from_plink.pdf", height=12, width=16)
z[, plot(PC1, PC2, xlab=paste0("PC1: ", round(percentVar[1]*100), "%"), ylab=paste0("PC2: ", round(percentVar[2]*100), "%"), main="main GBS", pch=16, col=col, cex=1.5, yaxt='n')]
axis(2, las=1)

par(oma=c(0,0,0,0))
legend("bottomright", legend=c("Breeding Line", "Necktie River", "Clearwater River", "Upper Rice Lake", "Garfield Lake", "Plantagenet", "Z. aquatica", "other"), pch=16, col=c(1:8), cex=1.5)
dev.off()

# Save the data
save(x, y, z, v, percentVar, file="191008_main_GBS_pca_from_plink.Rdata")

# Started here on 9 October 2019
# Not simply overwriting the above section because although it is largely the same, I want to preserve it since I use
# the data saved in the above section, but import a slightly different version of the sample key (colored differently).

# Load required package
library(data.table)

# Load data
load("191008_main_GBS_pca_from_plink.Rdata")

# Load in sample key (more identities of breedling lines are now known)
y <- fread("~/main_GBS/191008_main_GBS_sample_key.csv")

# Change column names to assist with merging data tables...
setnames(y, c("ID", "sample", "class", "col"))

# Remove a sample from data table y that is missing from x
y[sample != "Sample_0199"] -> y

# Merge data tables
x[y, on="sample"] -> z

# Make the plot
pdf("191009_main_GBS_PCA_Itasca_and_JohnsonDoraF4.pdf", height=10, width=10)
z[, plot(PC1, PC2, xlab=paste0("PC1: ", round(percentVar[1]*100), "%"), ylab=paste0("PC2: ", round(percentVar[2]*100), "%"), main="main GBS", pch=16, col=col, cex=1.5, yaxt='n')]
axis(2, las=1)
legend("bottomright", legend=c("Itasca-C12", "Itasca Haploid", "Johnson x Dora F4", "Z. latifolia", "Z. aquatica", "other"), pch=16, col=c(1,2,3,4,7,8), cex=1.5)
dev.off()

# Save data
save(x, y, z, v, percentVar, file="191009_main_GBS_pca_from_plink.Rdata")