-
Notifications
You must be signed in to change notification settings - Fork 5
/
code_for_submission.R
102 lines (88 loc) · 5.19 KB
/
code_for_submission.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#### Code workflow for clustering of cells in Olah et al. (2020) ####
#### Note that this code requires an older version of Seurat (v2), and takes a
#### long time to run in its entirety. As such, it can also be used as a
#### general workflow for the procedures outlined in the paper. ####

# Use library() rather than require(): these packages are mandatory, and
# library() fails loudly if one is missing instead of returning FALSE.
library(Seurat)        # clustering (requires Seurat v2 per the note above)
library(randomForest)  # used by the robustness / prediction helpers
library(Matrix)

# memory.limit() only ever worked on Windows and is defunct in R >= 4.2.0;
# guard the call so the script also runs on other platforms / newer R.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(80000)
}

# Defines cluster_over_parameters(), cluster_robustness(),
# cell_by_cell_prediction(), and the differential-expression helpers used below.
source("code_for_submission_functions.R")
### Step 1: load data, remove genes often associated with technical artifacts,
### and filter by total UMI count ###

# Counts table: rows = genes, columns = cells.
alldat <- read.csv("SupplementaryData14.csv", as.is = TRUE, row.names = 1,
                   header = TRUE)

# Drop gene families commonly reflecting technical artifacts:
# LOC* (uncharacterized loci), MT-* (mitochondrial), RP<digit>* / BC<digit>*
# (clone-derived names), and *-PS* (pseudogenes).
alldat <- alldat[grep("^LOC|^MT-|^RP[0-9]|^BC[0-9]|-PS", rownames(alldat),
                      invert = TRUE), ]

# Keep only cells with at least 1000 total UMIs.
alldat <- alldat[, which(colSums(alldat) >= 1000)]

# Counts-per-million normalization, computed per cell (column).
alldat_cpm <- sweep(alldat, 2, colSums(alldat), "/") * 10^6

# Batch label for each cell (second data column of the metadata table).
batchval <- read.csv("SupplementaryData15.csv", header = TRUE,
                     row.names = 1)[, 2]
### Step 2: run first-level clustering, ranging over parameters ###
### Note that the cluster IDs are saved as the names of the list, not as the
### values.
top_clustering_over_parameters <- cluster_over_parameters(
  alldat, batchval, keepcols = 1:ncol(alldat), pcrange = 5:15,
  resrange = seq(from = 0.2, to = 0.8, by = 0.2),
  krange = c(5, 10, 20, 30), batchreg = 1
)

### Step 3: assess cluster robustness for every parameter set and select the
### one with the highest number of clusters with minimum prediction score
### >= 0.75.
cluster_numbers <- rep(0, length(top_clustering_over_parameters))
# BUG FIX: the original read `predmatrices=list`, which assigns the *function*
# `list` itself; subsetting a closure with [[ ]] then errors inside the loop.
# We need an empty list created by list().
predmatrices <- list()
for (parval in seq_along(top_clustering_over_parameters)) {
  # Per-cell cluster IDs are encoded as one comma-separated string in the
  # list names; strsplit() returns a list, so take [[1]] to get the vector.
  clusterids <- strsplit(names(top_clustering_over_parameters)[parval],
                         ",")[[1]]
  prediction_matrix <- cluster_robustness(distweights = t(alldat_cpm),
                                          clids = clusterids,
                                          num_iterations = 20)
  minrows <- apply(prediction_matrix, 1, min)
  mincols <- apply(prediction_matrix, 2, min)
  # A cluster counts as robust only if both its row-wise and column-wise
  # minimum prediction scores reach 0.75.
  cluster_numbers[parval] <- length(which(minrows >= 0.75 & mincols >= 0.75))
  predmatrices[[parval]] <- prediction_matrix
}

# First parameter set achieving the maximal number of robust clusters.
optimal_pars <- which(cluster_numbers == max(cluster_numbers))[1]
# BUG FIX: the original omitted [[1]] here (unlike inside the loop above), so
# top_level_clusters was a length-1 list rather than a character vector,
# breaking the %in% test and element assignment below.
top_level_clusters <- strsplit(names(top_clustering_over_parameters)[optimal_pars],
                               ",")[[1]]
prediction_matrix <- predmatrices[[optimal_pars]]
minrows <- apply(prediction_matrix, 1, min)
mincols <- apply(prediction_matrix, 2, min)
distinct_clusters <- rownames(prediction_matrix)[which(minrows >= 0.75 &
                                                       mincols >= 0.75)]
# Cells belonging to non-robust clusters are pooled into a single "merged1"
# group.
merge_cells <- which(top_level_clusters %in%
                     setdiff(rownames(prediction_matrix), distinct_clusters))
top_level_clusters[merge_cells] <- "merged1"

### Show distribution of top-level clusters.
table(top_level_clusters)
### Step 4: for each top-level cluster, re-run steps 2 and 3 on its cells ###
subcluster_list <- list()
unique_top_level_clusters <- unique(top_level_clusters)
for (topcl in unique_top_level_clusters) {
  keepcells <- which(top_level_clusters == topcl)
  sub_clustering_over_parameters <- cluster_over_parameters(
    alldat, batchval, keepcols = keepcells, pcrange = 5:15,
    resrange = seq(from = 0.2, to = 0.8, by = 0.2),
    krange = c(5, 10, 20, 30), batchreg = 1
  )
  cluster_numbers <- rep(0, length(sub_clustering_over_parameters))
  # BUG FIX: the original read `predmatrices=list` (assigning the function
  # `list`, not an empty list), which makes predmatrices[[parval]] <- ...
  # error. Same fix as in the top-level pass.
  predmatrices <- list()
  for (parval in seq_along(sub_clustering_over_parameters)) {
    clusterids <- strsplit(names(sub_clustering_over_parameters)[parval],
                           ",")[[1]]
    prediction_matrix <- cluster_robustness(
      distweights = t(alldat_cpm[, keepcells]),
      clids = clusterids, num_iterations = 20
    )
    minrows <- apply(prediction_matrix, 1, min)
    mincols <- apply(prediction_matrix, 2, min)
    cluster_numbers[parval] <- length(which(minrows >= 0.75 & mincols >= 0.75))
    predmatrices[[parval]] <- prediction_matrix
  }
  if (max(cluster_numbers) > 1) {
    optimal_pars <- which(cluster_numbers == max(cluster_numbers))[1]
    # BUG FIX: [[1]] added (as in the inner loop) so sub_level_clusters is a
    # character vector, not a length-1 list.
    sub_level_clusters <- strsplit(names(sub_clustering_over_parameters)[optimal_pars],
                                   ",")[[1]]
    prediction_matrix <- predmatrices[[optimal_pars]]
    minrows <- apply(prediction_matrix, 1, min)
    mincols <- apply(prediction_matrix, 2, min)
    distinct_clusters <- rownames(prediction_matrix)[which(minrows >= 0.75 &
                                                           mincols >= 0.75)]
    # Pool cells from non-robust subclusters into a single "merged1" group.
    merge_cells <- which(sub_level_clusters %in%
                         setdiff(rownames(prediction_matrix), distinct_clusters))
    sub_level_clusters[merge_cells] <- "merged1"
    subcluster_list[[as.character(topcl)]] <- sub_level_clusters
  } else {
    # At most one robust subcluster: this top-level cluster is not subdivided.
    subcluster_list[[as.character(topcl)]] <- "No_subdivisions"
  }
}
### Compile top-level and sub-level cluster names ###
# Final label per cell is "<top>_<sub>" where a subdivision exists, otherwise
# just the top-level label.
two_level_clusters <- as.character(top_level_clusters)
for (topname in names(subcluster_list)) {
  sublabels <- subcluster_list[[topname]]
  if (sublabels[1] == "No_subdivisions") {
    next  # this top-level cluster was not subdivided; keep its label as-is
  }
  target <- which(two_level_clusters == topname)
  two_level_clusters[target] <- paste0(two_level_clusters[target], "_",
                                       sublabels)
}
table(two_level_clusters)
### After clustering is finished, use random forest classification to generate
### prediction scores for each cell in each pair of clusters. This set of
### matrices is the basis for the constellation plot. ###

# Published cluster assignment per cell (first data column of the metadata
# table).
clusterids <- read.csv("SupplementaryData15.csv", header = TRUE,
                       row.names = 1)[, 1]
# Writes the per-pair prediction matrices to the given .rda file.
cell_predictions <- cell_by_cell_prediction(
  dataset = t(alldat_cpm), clusterids = clusterids, crossval = 4,
  iterations = 100, filename = "rf_predictions_all_clusters.rda"
)

### Differential expression across clusters ###
deg_pairwise <- pairwise_differential_expression(alldat, clusterids)
deg_one_versus_all <- one_versus_all_differential_expression(alldat, clusterids)