-
Notifications
You must be signed in to change notification settings - Fork 0
/
LD_decay_separate_classes.R
217 lines (185 loc) · 8.55 KB
/
LD_decay_separate_classes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# Purpose of this script is to do LD analysis separately on Natural Stands and Cultivated Material
# Load required packages
library(data.table)
library(sommer)
# Read in imputed data
load("220221_imputed.Rdata")
# Long-formatted data are stored in the object called "yy"
# We want to be able to sort and filter based on whether samples belong to Natural Stands or Cultivated Material, so we need to read in some data to help with this
key <- fread("191021_main_GBS_sample_key_PBML-C20_renamed.csv")
# The next few steps are meant to replicate what `VLOOKUP()` could achieve in Excel
# Change first column of key to allow us to merge data.tables
setnames(key, "sample_number", "sample")
# Merge tables based on the sample column
merged <- merge(yy, key, on = "sample")
# Remove superfluous columns
# We'll want to remove some others later, but they might still be useful for now
merged[, ID := NULL]
merged[, sample_ID_extended := NULL]
# Check to make sure we only have the samples that we want to include
unique(merged[class == "Natural stand"]$sample_ID_simplified)
# From this, we can tell that the "old" Garfield and Shell Lake samples are included
# So is Big Fork River and Zizania aquatica. We'll want to filter those out
# Define samples to exclude
NS_to_exclude <- c("Garfield Lake Old", "Shell Lake Old", "Big Fork River", "Zizania aquatica")
# The next line simply verifies that the code works as expected
unique(merged[class == "Natural stand" & !(sample_ID_simplified %in% NS_to_exclude)]$sample_ID_simplified)
# Now, filter natural stands for real
natural_stands <- merged[class == "Natural stand" & !(sample_ID_simplified %in% NS_to_exclude)]
# Do the same for cultivated material ("breeding lines")
# It'll be easier to specify which ones we want to include rather than which ones we want to exclude
cultivated_to_keep <- c("14S*PS", "Barron", "FY-C20", "K2EF-C16", "Itasca-C12", "Itasca-C20", "PM3E")
# Again, we prove to ourselves that this approach works
unique(merged[class == "Breeding line" & sample_ID_simplified %in% cultivated_to_keep]$sample_ID_simplified)
cultivated <- merged[class == "Breeding line" & sample_ID_simplified %in% cultivated_to_keep]
# Convert natural stands to wide format
dcast(natural_stands, scaffold + position + snp_id + ref + alt ~ sample, value.var="GT") -> ns_wide
# Convert cultivated to wide wide format
dcast(cultivated, scaffold + position + snp_id + ref + alt ~ sample, value.var="GT") -> cm_wide
# Write separated data to their own CSV files
write.csv(ns_wide, file = "220401_natural_stand_only_for_LD_decay.csv")
write.csv(cm_wide, file = "220401_cultivated_only_for_LD_decay.csv")
## The next step was to remove the column names and row names so that we are left with only a matrix of 0s, 1s, and 2s. No row or column names.
## This is really where we get into the part of actually calculating LD/LD decay
# Read in cultivated (no zeros)
cm_dat <- fread("220401_cultivated_only_for_LD_decay_no_row_or_col_names.csv")
# Read in natural stands (no zeros)
ns_dat <- fread("220401_natural_stand_only_for_LD_decay_no_row_or_col_names.csv")
# Read in map data
map_data <- fread("/home/jkimball/haasx092/LD_decay/220401_map_data_for_LD_decay.csv")
# Change column names of map_data to suit needs of LD.decay() function
setnames(map_data, c("Locus", "LG", "Position"))
# Transpose data so that columns are markers and rows are individuals
cm_dat_t <- t(cm_dat)
ns_dat_t <- t(ns_dat)
# Convert data to matrix
cm_dat_t_m <- as.matrix(cm_dat_t)
ns_dat_t_m <- as.matrix(ns_dat_t)
# Convert data type back to numeric/integer from character. I thought it would help solve the issue of empty results. I was wrong-it didn't solve the problem--but also didn't appear to cause any problems.
class(cm_dat_t_m) <- "numeric"
class(ns_dat_t_m) <- "numeric"
# Set column names
colnames(cm_dat_t_m) <- map_data$Locus
colnames(ns_dat_t_m) <- map_data$Locus
# Set rownames
rownames(cm_dat_t_m) <- colnames(cm_wide[, c(6:143)])
rownames(ns_dat_t_m) <- colnames(ns_wide[,c(6:535)])
map_data[, LG := sub("ZPchr", "", LG)]
map_data[, LG := as.numeric(map_data$LG)]
map_data[LG == 458, LG := 17]
# Calculate LD decay (for cultivated)
cultivated_results <- LD.decay(cm_dat_t_m, map_data, unlinked = TRUE)
cultivated_results_linked <- LD.decay(cm_dat_t_m, map_data)
# Calculate LD decay (for natural stands)
natural_stands_results <- LD.decay(ns_dat_t_m, map_data, unlinked = TRUE)
natural_stands_results_linked <- LD.decay(ns_dat_t_m, map_data)
# Save data
save(cultivated_results, cultivated_results_linked , natural_stands_results, natural_stands_results_linked, file = "220401_LD_decay_separate_classes.Rdata")
# Make plots
# Natural stands
pdf("LD_plots_natural_stands.pdf", height = 15, width = 15)
layout(matrix(c(1, 5, 9, 13, 17,
2, 6, 10, 14, 0,
3, 7, 11, 15, 0,
4, 8, 12, 16, 0), nrow = 5), widths = c(1,1,1,1))
par(oma = c(3,6,3,0))
par(mar = c(4,4,4,2))
for (i in c(1:17)){
if(i <= 15){
data <- as.data.table(natural_stands_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Chromosome ", i),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
if(i == 16){
data <- as.data.table(natural_stands_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Scaffold 51"),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
if(i == 17){
data <- as.data.table(natural_stands_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Scaffold 458"),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
}
dev.off()
# Make plots
# Cultivated
pdf("LD_plots_cultivated.pdf", height = 15, width = 15)
layout(matrix(c(1, 5, 9, 13, 17,
2, 6, 10, 14, 0,
3, 7, 11, 15, 0,
4, 8, 12, 16, 0), nrow = 5), widths = c(1,1,1,1))
par(oma = c(3,6,3,0))
par(mar = c(4,4,4,2))
for (i in c(1:17)){
if(i <= 15){
data <- as.data.table(cultivated_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Chromosome ", i),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
if(i == 16){
data <- as.data.table(cultivated_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Scaffold 51"),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
if(i == 17){
data <- as.data.table(cultivated_results_linked$by.LG[i])
data <- data[p <= 0.05] # filter out all insignificant p-values
data[p <= 0.05, col := "black"]
data[p <= 0.01, col := "#235e39"]
data[p <= 0.001, col := "#00a54c"]
data[, plot(x = d, y = r2, main = paste0("Scaffold 458"),
xlab = "distance",
ylab = "R^2",
family = "serif",
col = col,
pch = 16,
las = 1)]
}
}
dev.off()