-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprepro-mRNA.R
236 lines (219 loc) · 8.93 KB
/
prepro-mRNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#FROM:
## HANDS-ON: NGS ANALYSIS --> Quantification files
## By Carlos Martínez Mira, Nov-2016
## By Sonia Tarazona, Nov-2016
##https://www.bioconductor.org/packages/release/bioc/vignettes/NOISeq/inst/doc/NOISeq.pdf
## &&
## Data preparation:
## -Quality Control & bias removal
## By Cristobal Fresno - [email protected]
library(SummarizedExperiment)
library(TCGAbiolinks)
library(biomaRt)
subtype=read.table("subtype.tsv",header=T)
#get the data
xprssn <- GDCquery(project = "TCGA-BRCA",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode=subtype$samples)
GDCdownload(xprssn)
expre=GDCprepare(xprssn,summarizedExperiment=F)#fails if T
temp=as.matrix(expre[,2:ncol(expre)])
rownames(temp)=expre$X1
expre=temp
write.table(expre,"RNAseq.tsv",sep='\t',quote=F)
#subtype to duplicates
i=substr(colnames(expre),1,19)
j=i[duplicated(i)]
designExp=subtype[c(which(!subtype$samples%in%j),
as.numeric(sapply(which(subtype$samples%in%j),rep,2))),]
designExp=designExp[order(match(designExp$samples,substr(colnames(expre),1,19))),]
designExp$barcode=colnames(expre)
#check everything matches
#keep only transcript id not version numbers
rownames(expre)=sapply(strsplit(rownames(expre),".",fixed=T),
function(x) x[1])
#annnotate GC content, length & biotype per transcript
mart=useEnsembl("ensembl",dataset="hsapiens_gene_ensembl")
myannot=getBM(attributes = c("ensembl_gene_id",
"percentage_gene_gc_content", "gene_biotype",
"start_position","end_position","hgnc_id","hgnc_symbol"),
filters = "ensembl_gene_id",
values=rownames(expre),mart=mart)
myannot$length=abs(myannot$end_position-myannot$start_position)
#filter transcripts withouth annotation
myannot=myannot[myannot$gene_biotype=="protein_coding"&
myannot$hgnc_symbol!="",]
myannot=myannot[!duplicated(myannot$ensembl_gene_id),]
exprots_hgnc=expre[rownames(expre)%in%myannot$ensembl_gene_id,]
dim(exprots_hgnc)
#[1] 19210 809
#check duplicated probes
#myannot[myannot$hgnc_id==
# myannot$hgnc_id[duplicated(myannot$hgnc_id)],]
#which(rownames(exprots_hgnc)=="ENSG00000279195")#not anymore
#exprots_hgnc[rownames(exprots_hgnc)=="ENSG00000204510",]=
# exprots_hgnc[rownames(exprots_hgnc)=="ENSG00000204510",]+
# exprots_hgnc[19192,]
#exprots_hgnc=exprots_hgnc[c(1:19191,19193:nrow(exprots_hgnc)),]
#dim(exprots_hgnc)
#myannot=myannot[myannot$ensembl_gene_id%in%rownames(exprots_hgnc),]
##################CHECK BIASES########################################################
library(NOISeq)
library(edgeR)
#format data for noiseq
noiseqData = readData(data = exprots_hgnc, gc = myannot[,1:2],
biotype = myannot[,c(1,3)],factor=designExp,
length=myannot[,c(1,8)])
#1)check expression bias per subtype
mycountsbio = dat(noiseqData, type = "countsbio", factor = "subtype")
#patients with repeated measures
png("CountsOri.png")
explo.plot(mycountsbio, plottype = "boxplot",samples = 1:5)
dev.off()
#2)check for low count genes
png("lowcountsOri.png")
explo.plot(mycountsbio, plottype = "barplot", samples = 1:5)
dev.off()
png("lowCountThres.png")
hist(rowMeans(cpm(exprots_hgnc,log=T)),ylab="genes",
xlab="mean of log CPM",col="gray")
abline(v=0,col="red")
dev.off()
#3)check for transcript composition bias
#each sample s is compared to a reference r (which can be arbitrarily chosen).
#by computing M values=log2(countss = countsr).
#Confidence intervals for the M median is computed by bootstrapping.
#If the median of M values for each comparison is not in the CI, the deviation
# of the sample is significant, therefore, normalization is needed
mycd = dat(noiseqData, type = "cd", norm = FALSE) #slooooow
#[1] "Warning: 107 features with 0 counts in all samples are to be removed for this analysis."
table(mycd@dat$DiagnosticTest[, "Diagnostic Test"])
#FAILED PASSED
# 695 113
png("MvaluesOri.png")
explo.plot(mycd,samples=sample(1:ncol(exprots_hgnc),10))
dev.off()
#4)check for length & GC bias
#A cubic spline regression model is fitted. Both the model p-value and the coefficient
# of determination (R2) are shown. If the model p-value is significant and R2 value is
# high (more than 70%), the expression depends on the feature
myGCcontent <- dat(noiseqData, k = 0, type = "GCbias",
factor = "subtype")
png("GCbiasOri.png",width=1000)
par(mfrow=c(1,5))
sapply(1:5,function(x) explo.plot(myGCcontent, samples = x))
dev.off()
#The GC-content of each gene does not change from sample to sample, so it can be expected to
#have little effect on differential expression analyses to a first approximation
mylenBias <- dat(noiseqData, k = 0, type = "lengthbias",
factor = "subtype")
png("lengthbiasOri.png",width=1000)
par(mfrow=c(1,5))
sapply(1:5,function(x) explo.plot(mylenBias, samples = x))
dev.off()
#BUT, since the gene has the same length in all your samples, there is no need to divide by the gene length
#5) check for batch effect
myPCA = dat(noiseqData, type = "PCA", norm = F, logtransf = F)
png("PCA_Ori.png")
explo.plot(myPCA, samples = c(1,2), plottype = "scores",
factor = "subtype")
dev.off()
#################SOLVE BIASES######################################################
library(EDASeq)
#1) filter low count genes.
#CPM=(counts/fragments sequenced)*one million.
#Filtering those genes with average CPM below 1, would be different
#to filtering by those with average counts below 1.
countMatrixFiltered = filtered.data(exprots_hgnc, factor = "subtype",
norm = FALSE, depth = NULL, method = 1, cpm = 0, p.adj = "fdr")
#17077 features are to be kept for differential expression analysis with filtering method 1
myannot=myannot[myannot$ensembl_gene_id%in%rownames(countMatrixFiltered),]
#all names must match
mydataEDA <- newSeqExpressionSet(
counts=as.matrix(countMatrixFiltered),
featureData=data.frame(myannot,row.names=myannot$ensembl_gene_id),
phenoData=data.frame(designExp,row.names=designExp$barcode))
#order for less bias
gcFull <- withinLaneNormalization(mydataEDA,
"percentage_gene_gc_content", which = "full")#corrects GC bias
lFull <- withinLaneNormalization(gcFull, "length", which = "full")#corrects length bias
fullfullTMM <-NOISeq::tmm(normCounts(lFull), long = 1000, lc = 0, k = 0)
#norm.counts <- betweenLaneNormalization(normCounts(lFull),
# which = "median", offset = FALSE)
#FAILED PASSED
# 290 518
noiseqData = NOISeq::readData(data = fullfullTMM,, factors=designExp)
#cd has to preceed ARSyN or won't work
mycd=NOISeq::dat(noiseqData,type="cd",norm=TRUE)
table(mycd@dat$DiagnosticTest[, "Diagnostic Test"])
#FAILED PASSED
# 297 511
#full
#FAILED PASSED
# 346 462
#############################SOLVE BATCH EFFECT#######################################################
myPCA = dat(noiseqData, type = "PCA", norm = T, logtransf = F)
png("preArsyn.png")
explo.plot(myPCA, samples = c(1,2), plottype = "scores",
factor = "subtype")
dev.off()
ffTMMARSyn=ARSyNseq(noiseqData, factor = "subtype", batch = F,
norm = "n", logtransf = T)
myPCA = dat(ffTMMARSyn, type = "PCA", norm = T,logtransf = T)
png("postArsyn.png")
explo.plot(myPCA, samples = c(1,2), plottype = "scores",
factor = "subtype")
dev.off()
#############################FINAL QUALITY CHECK#######################################################
noiseqData = readData(data = exprs(ffTMMARSyn), gc = myannot[,1:2],
biotype = myannot[,c(1,3)],factor=designExp,
length=myannot[,c(1,8)])
mycountsbio = dat(noiseqData, type = "countsbio", factor = "subtype",
norm=T)
png("CountsFinal.png")
explo.plot(mycountsbio, plottype = "boxplot",samples=1:5)
dev.off()
myGCcontent <- dat(noiseqData, k = 0, type = "GCbias",
factor = "subtype",norm=T)
png("GCbiasFinal.png",width=1000)
par(mfrow=c(1,5))
sapply(1:5,function(x) explo.plot(myGCcontent, samples = x))
dev.off()
mylenBias <- dat(noiseqData, k = 0, type = "lengthbias",
factor = "subtype",norm=T)
png("lengthbiasFinal.png",width=1000)
par(mfrow=c(1,5))
sapply(1:5,function(x) explo.plot(mylenBias, samples = x))
dev.off()
#############################RESOLVE DUPLICATES & SAVE##################################################
#get duplicates
i=designExp$samples[duplicated(designExp$samples)]
#get sample barcode per sample
i=lapply(i,function(x) designExp$barcode[designExp$samples==x])
#separate duplicates
final=exprs(ffTMMARSyn)
duplis=final[,colnames(final)%in%unlist(i)]
prefi=final[,!colnames(final)%in%unlist(i)]
#average duplicates
temp=do.call(cbind,lapply(i,function(x)
rowMeans(duplis[,colnames(duplis)%in%x])))
#identify samples with barcode
colnames(temp)=designExp$samples[duplicated(designExp$samples)]
colnames(prefi)=substr(colnames(prefi),1,19)
#joint matrices
final=cbind(prefi,temp)
dim(final)
#[1] 17077 805
final=final[,order(match(colnames(final),subtype$samples))]
write.table(final,"RNAseqnormalized.tsv",sep='\t',quote=F)
#duplicates share everything except the plate
# TCGA-A7-A13D-01A-13R-A12P-07
# TCGA-A7-A13D-01A-13R-A277-07
# TCGA-A7-A26E-01A-11R-A277-07
# TCGA-A7-A26E-01A-11R-A169-07
# TCGA-A7-A26J-01A-11R-A169-07
# TCGA-A7-A26J-01A-11R-A277-07
# TCGA-A7-A13E-01A-11R-A12P-07
# TCGA-A7-A13E-01A-11R-A277-07