-
Notifications
You must be signed in to change notification settings - Fork 0
/
07b_dgeAll.R
143 lines (109 loc) · 4.49 KB
/
07b_dgeAll.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# Create a dgeList object that contains both the samples used to train the classifier,
# the internal validation set that was part of the same dataset,
# and the now-unblinded validation samples
library(here)
library(edgeR)
library(tidyverse)
## Validation data ----
#Load the validation samples
dgeVal <- readRDS(here("Rds/06_dgeVal.Rds"))
head(dgeVal$samples)
#The case-control status of the blind validation set:
dict_df <- read_csv(here("07_validated_sample_dictionary.csv"))
head(dict_df)
## Pellet quality ----
#In the results sheet from the VUMC, there is some additional information regarding
#ery contamination and platelet pellet quality.
#Let's add this to the dictionary.
dict_df <- readxl::read_excel(here("dataset/third_party_val",
"Classify BRCA NKI july 2020 plus ery data 24-7-2020_VU.xlsx"),
sheet = "RNAs detected") %>%
select(sample, ery_contam = `Ery contamination`, pellet_qual = `Platelet pellet`) %>%
left_join(dict_df, ., by = "sample")
#The rest are NA
table(dict_df$ery_contam)
table(dict_df$pellet_qual)
## Case-control status ----
#Add the case-control status to the (no longer) blind validation set:
dgeVal$samples <- dgeVal$samples %>%
rownames_to_column("sample") %>%
left_join(., dict_df, by = "sample") %>%
mutate(
group = factor(
if_else(Status_description == "Case", "breastCancer", "healthyControl"),
levels = c("healthyControl", "breastCancer")),
stage_detailed = Stage,
stage = as.integer(substr(stage_detailed, 1, 1))
)
#Tibbles removes row names
rownames(dgeVal$samples) <- dgeVal$samples$sample
#Sanity checks
stopifnot(all(rownames(dgeVal$samples) == colnames(dgeVal)))
stopifnot(all((dgeVal$samples$group == "breastCancer") == (dgeVal$samples$Status_description == "Case")))
stopifnot(all((dgeVal$samples$group == "healthyControl") == (dgeVal$samples$Status_description == "Control")))
#Add a Label column for later
dgeVal$samples$Label <- "Blind Validation"
head(dgeVal$samples)
## Stage ----
#Recode the stage column so it matches the same factor levels we had before:
dgeVal$samples$stage <- dgeVal$samples$stage %>%
replace_na("healthyControl") %>%
dplyr::recode_factor(
"healthyControl" = "healthyControl",
"1" = "I", "2" = "II" ,
"3" = "III", "4" = "IV"
)
levels(dgeVal$samples$stage)
#Save the validation data as its own DGEList including metadata
saveRDS(dgeVal, here("Rds/07b_dgeVal.Rds"))
## Training data ----
#The count matrix used to train both models:
dgeFiltered <- readRDS(here("Rds/01_dgeFiltered.Rds"))
stopifnot(all(levels(dgeFiltered$samples$stage) == levels(dgeVal$samples$stage)))
## Combine val set with training set ----
#Create a new DGEList that contains both the samples used to train the
#models and the blind validation set:
dgeAll <- edgeR::DGEList(
counts = cbind(
dgeFiltered$counts, #Samples used to train the model
dgeVal$counts #New samples
),
samples = bind_rows(
#Training samples
#Keep original data partition for reference
#Change label to "training"
dgeFiltered$samples %>%
rownames_to_column("sample") %>%
mutate(Original_Label = Label) %>%
mutate(Dataset = "Original",
Label = "Training") %>%
select(group, sample, stage, Dataset, Label, Age,
Original_Label, isolationlocation),
#Blind validation samples
dgeVal$samples %>%
mutate(Dataset = "blindVal", isolationlocation = "blindNKI",
Label = "Validation") %>%
select(group, sample, stage, Dataset, Label, Age,
isolationlocation, ery_contam, pellet_qual)
),
genes = dgeFiltered$genes
)
#head(dgeAll$samples)
#sanity checks
stopifnot(all(colnames(dgeAll) == rownames(dgeAll$samples)))
stopifnot(all(colnames(dgeAll) == dgeAll$samples$sample))
stopifnot(all(rowSums(is.na(dgeAll$counts)) == 0))
#Remove the extra group columns
dgeAll$samples$group.1 <- NULL
head(dgeAll$samples)
## Recode hospital ---
#Some hospitals barely contributed any samples
dgeAll$samples$isolationlocation %>% table()
#Put low count hospitals into an "other" category
dgeAll$samples$hosp <- ifelse(dgeAll$samples$isolationlocation %in% c("AMC", "UMCU", "VIENNA"),
"other", dgeAll$samples$isolationlocation)
dgeAll$samples$hosp <- factor(dgeAll$samples$hosp,
levels = c("other", "VUMC", "MGH", "NKI", "blindNKI"))
table(dgeAll$samples$isolationlocation, dgeAll$samples$hosp)
## Write DGEList ----
saveRDS(object = dgeAll, file = here("Rds/07b_dgeAll.Rds"))