title | author | date |
---|---|---|
Notebook1_preprocessing |
Laura Wolbeck |
2023-02-21 |
#Load helper functions
source("scRNA_helper_functions_preterm.R")
#Load libraries
library(conos) #version 1.4.0
library(pagoda2)
library(magrittr)
library(ggplot2)
library(CRMetrics)
Make list of count matrices (cms), early (E 18.5 and pre P0) and late (full P2 and pre P3) timepoints will be analysed separately
#create vector with paths to filtered cms
paths1 <- c(E18_5_1= "/data/home/lwolbeck/data/Japan_preterm_counts/E18_5_1/outs/filtered_feature_bc_matrix",
E18_5_2="/data/home/lwolbeck/data/Japan_preterm_counts/E18_5_2/outs/filtered_feature_bc_matrix",
pre_P0_1= "/data/home/lwolbeck/data/Japan_preterm_counts/pre_P0_1/outs/filtered_feature_bc_matrix",
pre_P0_2= "/data/home/lwolbeck/data/Japan_preterm_counts/pre_P0_2/outs/filtered_feature_bc_matrix")
paths2 <- c(full_P2_1 = "/data/home/lwolbeck/data/Japan_preterm_counts/full_P2_1/outs/filtered_feature_bc_matrix",
full_P2_2 ="/data/home/lwolbeck/data/Japan_preterm_counts/full_P2_2/outs/filtered_feature_bc_matrix",
pre_P3_1 = "/data/home/lwolbeck/data/Japan_preterm_counts/pre_P3_1/outs/filtered_feature_bc_matrix",
pre_P3_2= "/data/home/lwolbeck/data/Japan_preterm_counts/pre_P3_2/outs/filtered_feature_bc_matrix")
#read in data in a parallel manner
cms1 <- pagoda2::read.10x.matrices(paths1, n.cores=10)
cms2 <- pagoda2::read.10x.matrices(paths2, n.cores=10)
#to check if any colnames are duplicates
any(duplicated(unlist(lapply(cms1,colnames))))
any(duplicated(unlist(lapply(cms2,colnames))))
combine both matrices
cms <- append(cms1, cms2)
using the CRMetrics package to plot summary metrics
metadata <- data.frame(sample=names(cms), group= c("fullterm", "fullterm", "preterm", "preterm", "fullterm", "fullterm", "preterm", "preterm"))
crm <- CRMetrics$new(data.path="/data/Japan_preterm_counts", n.cores = 30, metadata = metadata)
plot Fig S2A
crm$plotSummaryMetrics(comp.group = "sample", second.comp.group = "group", metrics = "estimated number of cells",plot.geom = "bar") + xlab("") + ylab("number of cells") + theme(legend.position = "right", legend.title = element_blank())
ggsave("number_of_cells.pdf")
#adding count matrices to crm object
crm$addDetailedMetrics()
plot Fig S2B
crm$plotDetailedMetrics(comp.group = "group",
metrics = "gene_count",
plot.geom = "violin",hline = F) + ylab("genes per cell") + theme(legend.position = "right", legend.title = element_blank()) + stat_summary(fun.y=median, size=10, geom = "text", label = "-") + theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12))
ggsave("VlnPlot_genes_per_cell.pdf")
create a first conos object to use for filtering out low quality cells
#vector with sample names
names1 = c("E18_5_1_",
"E18_5_2_",
"pre_P0_1_",
"pre_P0_2_")
names2 = c("full_P2_1_",
"full_P2_2_",
"pre_P3_1_",
"pre_P3_2_")
#quickConos() is a wrapper function defined in scRNA_helper_functions_preterm.R
con1 <- quickConos(cms1,
names1,
n.cores.p2=10,
n.cores.con=20, alignment.strength = 0.2)
con1 <- con1$con
con2 <- quickConos(cms2,
names2,
n.cores.p2=10,
n.cores.con=20, alignment.strength = 0.2)
con2 <- con2$con
Create new directories scrublet1 and scrublet2
Write files for Srublet
mapply(function(x,y) write.table(as.matrix(x$misc$rawCounts), file = paste0("scrublet1/",y,".csv"), dec = ".", sep=","), x=con1$samples, y=names(con1$samples))
mapply(function(x,y) write.table(as.matrix(x$misc$rawCounts), file = paste0("scrublet2/",y,".csv"), dec = ".", sep=","), x=con2$samples, y=names(con2$samples))
Run Scrublet on each sample .csv file separately using the following python script "scrub.py":
#!/usr/bin/env python
import sys
import pandas
import scrublet
for filename in sys.argv[1:]:
print('processing '+filename);
print('Reading...')
df = pandas.read_csv(filename)
scrub = scrublet.Scrublet(df)
print('Scoring...')
doublet_scores, predicted_doublets = scrub.scrub_doublets()
print('Saving...')
pandas.DataFrame(doublet_scores).to_csv(filename+".doubletScores")
print('All done.')
Import doublet scores
doubletvec1 <- do.call("rbind", lapply(dir(path = "scrublet1/", pattern=".doubletScores", full.names = T), read.csv)) %>% .$X0 %>% setNames(con1$samples %>% sapply(function(x) rownames(x$misc$rawCounts)) %>% unlist)
doubletvec2 <- do.call("rbind", lapply(dir(path = "scrublet2/", pattern=".doubletScores", full.names = T), read.csv)) %>% .$X0 %>% setNames(con2$samples %>% sapply(function(x) rownames(x$misc$rawCounts)) %>% unlist)
singlets1 <- doubletvec1[doubletvec1>0.3] #758 cells (2.46%) are removed
cms1 %<>% lapply(function(s) s[,!colnames(s) %in% names(singlets1)])
singlets2 <- doubletvec2[doubletvec2>0.3] #940 cells (2.95%) are removed
cms2 %<>% lapply(function(s) s[,!colnames(s) %in% names(singlets2)])
plot <- con1$plotGraph(groups = doubletvec1>0.3, size=0.1, mark.groups=F, show.legend = F, title="Doublets E18.5 fetus and preterm P0 samples") +scale_colour_manual(values=c("grey", "red"))+ theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
rasterize(plot, layers='Point', dpi=1000)
ggsave("Doublets_early.pdf", width=7, height=4.3 )
plot <- con2$plotGraph(groups = doubletvec2>0.3, size=0.1, mark.groups=F, show.legend = F, title="Doublets in full-term P2 and preterm P3 samples") +scale_colour_manual(values=c("grey", "red"))+theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
rasterize(plot, layers='Point', dpi=1000)
ggsave("Doublets_late.pdf", width=7, height=4.3 )
Get mito fraction (percent of mitochondrial genes per cell) with helper function
mito1 <- mitoFraction(con1, "mouse")
mito2 <- mitoFraction(con2, "mouse")
cms1 %<>% lapply(function(s) s[,!colnames(s) %in% names(mito1[mito1>0.05])])
cms2 %<>% lapply(function(s) s[,!colnames(s) %in% names(mito2[mito2>0.05])])
get depth with helper function
depth1 <- getConosDepth(con1)
depth2 <- getConosDepth(con2)
cms1 %<>% lapply(function(s) s[,!colnames(s) %in% names(depth1[depth1<1000])])
cms2 %<>% lapply(function(s) s[,!colnames(s) %in% names(depth2[depth2<1000])])
pre-processing of each filtered dataset with pagoda2, followed by using conos to build the joint UMAP graph with forced alignment (alignment.strength = 0.2) to integrate samples well
con1 <- quickConos(cms1,
names1,
n.cores.p2=10,
n.cores.con=20, get.tsne = TRUE, alignment.strength=0.2)
con1 <- con1$con
con2 <- quickConos(cms2,
names2,
n.cores.p2=10,
n.cores.con=20, get.tsne = TRUE, alignment.strength=0.2)
con2 <- con2$con
Rerun leiden clustering to find additional clusters
con1$findCommunities(method = leiden.community, resolution = 5, min.group.size = 15)
con2$findCommunities(method = leiden.community, resolution = 5, min.group.size = 15)
leiden24 <- con1$clusters$leiden$groups %>% factor
leiden24_2 <- con2$clusters$leiden$groups %>% factor
con1$clusters$leiden$groups <- leiden24
con2$clusters$leiden$groups <- leiden24_2
in late timepoints (full P2 and pre P3) cluster 20 was removed since it could not be annotated and was spread between several clusters
cms2 %<>% lapply(function(s) s[,!colnames(s) %in% names(leiden24_2[leiden24_2=="20"])])
generating new joint UMAP embedding with cluster 20 removed
con2 <- quickConos(cms2,
names2,
n.cores.p2=10,
n.cores.con=20, get.tsne = TRUE, alignment.strength=0.2)
con2 <- con2$con
The final clusters were generated using a combination of manual selection and findSubcommunities() function of conos to split the clusters further. The clusters were annotated using known cell-type specific marker genes from the literature.