-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_DNA_Rep_Clustering.R
163 lines (147 loc) · 6.05 KB
/
01_DNA_Rep_Clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#######################################
######### Libraries ############
#######################################
library("SummarizedExperiment")
library(DESeq2)
library(NMF)
library("BiocParallel")
library(pheatmap)
#library(CancerSubtypes)
#library(ConsensusClusterPlus)
#library(RColorBrewer)
#library(gplots)
#######################################
######### Data and Normalization ######
#######################################
###### Data input
ddr.seq <- read.csv("ddr_mat.csv", header=TRUE, row.names = 1)
ddr.mat <- data.matrix(ddr.seq)
#Preprocessing
# Remove genes with <10 reads, Only removes 1 gene
reads.geq10 <- (apply(ddr.mat,1,sum) >= 10)
ddr.desq <- ddr.mat[reads.geq10,]
#TCGA Barcode
# https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
barcode <- matrix(unlist(strsplit(colnames(ddr.mat), ".", fixed=TRUE)), ncol=7, byrow = TRUE)
colnames(barcode) <- c("Project", "TSS", "Participant", "Sample", "Portion", "Plate", "Center")
barcode <- data.frame(barcode)
barcode$Participant <- factor(barcode$Participant)
#This matches the variable of the same name in the clinical data
barcode$bcr_patient_barcode = paste0(barcode$Project,"-",barcode$TSS,"-",barcode$Participant)
# DESeq2-Based Normalization: Variance Stabilizing Transformation
# ddr.vsd is now the regularized log2-transformed data
ddr.vsd <- varianceStabilizingTransformation(ddr.mat, blind = TRUE)
#######################################
######### NMF Clustering ######
#######################################
#######################################
# The following code takes a long time to run (300 runs took 1.5 hours to run on 4 cores on my laptop)
#estim.r <- nmf(ddr.vsd, 2:6, nrun=30, .opt='vp4')
#save(estim.r, file='nmf_k_comparison.RData')
#plot(estim.r)
#consensusmap(estim.r, labCol=NA, labRow=NA)
#vsd.nmf <- nmf(x=ddr.vsd, rank=3, nrun=300, .opt='vp4')
vsd.nmf.k2 <- nmf(x=ddr.vsd, rank=2)
#save(vsd.nmf, file='./Results/nmf_k3_300.RData')
#save(vsd.nmf.k2, file='./Results/nmf_k2.RData')
load("./Results/nmf_k3_300.RData")
# Get details about the meta-genes
meta.genes <- extractFeatures(vsd.nmf)
c1.genes <- rownames(ddr.vsd)[meta.genes[[1]]]
c2.genes <- rownames(ddr.vsd)[meta.genes[[2]]]
c3.genes <- rownames(ddr.vsd)[meta.genes[[3]]]
write.table(paste0(c1.genes, collapse=", "), file="./Results/metagene_1_list.txt",
row.names=FALSE, col.names=FALSE, eol= "", quote = FALSE)
write.table(paste0(c2.genes, collapse=", "), file="./Results/metagene_2_list.txt",
row.names=FALSE, col.names=FALSE, eol= "", quote = FALSE)
write.table(paste0(c3.genes, collapse=", "), file="./Results/metagene_3_list.txt",
row.names=FALSE, col.names=FALSE, eol= "", quote = FALSE)
#######################################
######### Cluster Assignments ######
#######################################
#Assign Clusters from k=3 NMF Clustering
barcode$NMFC3 <- predict(vsd.nmf)
barcode$NMFC2 <- predict(vsd.nmf.k2)
write.csv(barcode, file = "./Results/cluster_results.csv", row.names = FALSE)
#######################################
######### DESeq2 ######
#######################################
# Create DeSeq dataset
ddr.desq <- DESeqDataSetFromMatrix(countData = ddr.mat,
colData = barcode,
design = ~ NMFC3)
#Calculate Results
ddr.desq <- DESeq(ddr.desq, parallel = TRUE, BPPARAM=MulticoreParam(4))
#save(ddr.desq, file='./Results/ddr.desq.Rdata')
ddr.res <- results(ddr.desq)
summary(ddr.res, alpha=.05)
#######################################
######### Plots ######
#######################################
pdf("basis_map.pdf")
basismap(vsd.nmf, subsetRow = TRUE)
dev.off()
pdf("coef_map.pdf")
coefmap(vsd.nmf)
dev.off()
pdf("consensus_map.pdf")
consensusmap(vsd.nmf)
dev.off()
pdf("consensus_map_k2.pdf")
consensusmap(vsd.nmf.k2)
dev.off()
pdf("basis_map_k2.pdf")
basismap(vsd.nmf.k2, subsetRow = TRUE)
dev.off()
pdf("coef_map_k2.pdf")
coefmap(vsd.nmf.k2)
dev.off()
#######################################
######### Old ######
#######################################
#######################################
######### Cluster Heatmaps ######
#######################################
# Function for selecting genes and creating a heatmap which clusters rows and columns
# Methods: most expressed, most varying, highest coefficient of variation, both expressed and varying
# n.genes sets the number of genes you want to select
# except for combined where the number of genes will be <= 2*n.genes
# Changes some of the defaults of pheatmap
phs <- function(sort.desq = ddr.desq, inp.vst = ddr.vsd, smethod = "mean", n.genes = 30, ord.dec = TRUE, ...){
stopifnot(any(smethod == c("mean", "var", "cv", "combined", "intersect")))
### Different methods for selecting genes
#Selects the most expressed genes
if(smethod == "mean"){
select <- order(rowMeans(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
}
# Selects the genes with the highest variance
# Not sure the trade off between sample size and wanting genes which actually vary
if(smethod == "var"){
select <- order(rowVars(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
}
# Selects the genes with the highest coefficient of variation
# cv = variance / mean
if(smethod == "cv"){
select <- order(sqrt(rowVars(assay(sort.desq)))/rowMeans(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
}
# Selects both high variance and highly expressed genes
if(smethod == "combined"){
mean.select <- order(rowMeans(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
var.select <- order(rowVars(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
select <- unique(c(mean.select, var.select))
}
# Selects both high variance and highly expressed genes
if(smethod == "intersect"){
mean.select <- order(rowMeans(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
var.select <- order(rowVars(assay(sort.desq)),
decreasing=ord.dec)[1:n.genes]
select <- mean.select[mean.select %in% var.select]
}
pheatmap(assay(inp.vst)[select,], ...)
}