-
Notifications
You must be signed in to change notification settings - Fork 0
/
plink_for_main_GBS
92 lines (62 loc) · 3.38 KB
/
plink_for_main_GBS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# 8 October 2019
# The purpose of this code is to use plink for creation of a PCA plot for the main GBS project and to make the SNP able
# WD: /home/jkimball/haasx092/main_GBS/190910_samtools
plink --vcf SeventeenLargestMerged.vcf.gz --double-id --maf 0.05 --allow-extra-chr --recode --out myplink
# PCA calculation
plink --pca --file myplink --allow-extra-chr -out myplink_pca
module load R/3.6.0
# Switch to R
# Load required package.
library(data.table)
# Read in eigenvectors to plot PCA
x <- fread("myplink_pca.eigenvec")
# Remove column 2 (redundant)
x[, V2 := NULL]
# Load in sample key
y <- fread("~/main_GBS/191008_main_GBS_sample_key.csv")
# set column names
setnames(x, c("sample", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12", "PC13",
"PC14", "PC15", "PC16", "PC17", "PC18", "PC19", "PC20"))
setnames(y, c("ID", "sample", "class", "col"))
# Shorten sample name (change from directory path format) so that it can be merged with data table y
x[, sample := sub("/.+$", "", sample)]
# Remove a sample from data table y that is missing from x
y[sample != "Sample_0199"] -> y
# Read in eigenvalues (to determine % variation explained by each PC)
v <- fread("myplink_pca.eigenval")
# Calculate percent variation (note: I didn't bother renaming the columns to something informative since there is only one)
percentVar = c(PC1=v[1, V1] / sum(v$V1), PC2=v[2, V1] / sum(v$V1))
# Merge data tables
x[y, on="sample"] -> z
# Make the plot
pdf("191008_main_GBS_pca_from_plink.pdf", height=12, width=16)
z[, plot(PC1, PC2, xlab=paste0("PC1: ", round(percentVar[1]*100), "%"), ylab=paste0("PC2: ", round(percentVar[2]*100), "%"), main="main GBS", pch=16, col=col, cex=1.5, yaxt='n')]
axis(2, las=1)
par(oma=c(0,0,0,0))
legend("bottomright", legend=c("Breeding Line", "Necktie River", "Clearwater River", "Upper Rice Lake", "Garfield Lake", "Plantagenet", "Z. aquatica", "other"), pch=16, col=c(1:8), cex=1.5)
dev.off()
# Save the data
save(x, y, z, v, percentVar, file="191008_main_GBS_pca_from_plink.Rdata")
# Started here on 9 October 2019
# Not simply overwriting the above section because although it is largely the same, I want to preserve it since I use
# the data saved in the above section, but import a slightly different version of the sample key (colored differently).
# Load required package
library(data.table)
# Load data
load("191008_main_GBS_pca_from_plink.Rdata")
# Load in sample key (more identities of breedling lines are now known)
y <- fread("~/main_GBS/191008_main_GBS_sample_key.csv")
# Change column names to assist with merging data tables...
setnames(y, c("ID", "sample", "class", "col"))
# Remove a sample from data table y that is missing from x
y[sample != "Sample_0199"] -> y
# Merge data tables
x[y, on="sample"] -> z
# Make the plot
pdf("191009_main_GBS_PCA_Itasca_and_JohnsonDoraF4.pdf", height=10, width=10)
z[, plot(PC1, PC2, xlab=paste0("PC1: ", round(percentVar[1]*100), "%"), ylab=paste0("PC2: ", round(percentVar[2]*100), "%"), main="main GBS", pch=16, col=col, cex=1.5, yaxt='n')]
axis(2, las=1)
legend("bottomright", legend=c("Itasca-C12", "Itasca Haploid", "Johnson x Dora F4", "Z. latifolia", "Z. aquatica", "other"), pch=16, col=c(1,2,3,4,7,8), cex=1.5)
dev.off()
# Save data
save(x, y, z, v, percentVar, file="191009_main_GBS_pca_from_plink.Rdata")