From 023f75f7da5c504da14b65a73c7b53aa5627fc86 Mon Sep 17 00:00:00 2001 From: Lijiao Ning Date: Tue, 23 Apr 2024 21:12:09 +0200 Subject: [PATCH] rephrase and reorder some codes --- docs/scRNAseq_basics/00_IOCsc_week0.md | 4 +-- docs/scRNAseq_basics/01_IOCsc_week1.md | 10 ++++---- docs/scRNAseq_basics/preprocessing.md | 34 +++++++++++++------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/docs/scRNAseq_basics/00_IOCsc_week0.md b/docs/scRNAseq_basics/00_IOCsc_week0.md index f0d664c8..a966ce60 100644 --- a/docs/scRNAseq_basics/00_IOCsc_week0.md +++ b/docs/scRNAseq_basics/00_IOCsc_week0.md @@ -50,8 +50,8 @@ To complete this week you'll need to : - [x] 4. Create a Seurat Object - [x] 5. Create an annotation table of zebrafish genes using `biomaRt`. -Add your RMD/QMD in your trello card. +Add your RMD/QMD in your Trello card. **Thank you for your attention and see you next week :clap: :clap: :clap:** ----- +---- \ No newline at end of file diff --git a/docs/scRNAseq_basics/01_IOCsc_week1.md b/docs/scRNAseq_basics/01_IOCsc_week1.md index 6d772bb1..6886534f 100644 --- a/docs/scRNAseq_basics/01_IOCsc_week1.md +++ b/docs/scRNAseq_basics/01_IOCsc_week1.md @@ -4,14 +4,14 @@ The preprocessing is the most important part of a single cell analysis because y can skew your result if you filter too much **or too little** and you must really understand what's going on these steps. -Please go read the following pages to learn more about it : [Preprocessing.](preprocessing.md). - -The preprocesing is composed of : +The preprocessing is composed of: - Filtering of low quality barcodes -- Barcode Normalization +- Barcode normalization - Selection of most variable features +Please go read the [preprocessing](preprocessing.md) pages to learn more about it. + --- ![](../R-IOC/images/toolbox-do-it-yourself.png){: style="width:75px"} **Do it yourself!** @@ -39,7 +39,7 @@ To complete this week you'll need to : each step of your thoughts. 
In general, try to explain in your own words, each step of your analysis ! -Add your RMD/QMD in your trello card. +Add your RMD/QMD in your Trello card. **Thank you for your attention and see you next week :clap: :clap: :clap:** diff --git a/docs/scRNAseq_basics/preprocessing.md b/docs/scRNAseq_basics/preprocessing.md index dc0c37ce..46f5c82f 100644 --- a/docs/scRNAseq_basics/preprocessing.md +++ b/docs/scRNAseq_basics/preprocessing.md @@ -2,7 +2,7 @@ The pre-processing steps are used to clean the data in order not to distort the results of downstream analyses (clustering analysis, markers, differential -expression analysis). +expression analysis, *etc.*). ## Filter out low quality cells @@ -36,7 +36,7 @@ A cell is generally considered to be in apoptosis when the transcriptome detects more than 20% of the genes in the MT genome. Some are more stringent in lowering this threshold to 10%. -``` r +```r ## Retrieve genes from the MT genome using biomart genes_MT <- annotated_hg19$ensembl_gene_id[annotated_hg19$chromosome_name == "MT"] @@ -55,7 +55,7 @@ VlnPlot(object = pbmc_small, -``` r +```r ## Graphical representation of QC ggplot(pbmc_small@meta.data, aes(y = nCount_RNA, @@ -69,8 +69,9 @@ ggplot(pbmc_small@meta.data, high = "red", mid = "yellow", midpoint = 20) + - ggtitle("QC plot", "Number of detected genes in function of number of UMI")+ - labs(y = "Number of UMI per cell", x = "Number of detected genes by cell") + labs(x = "Number of detected genes by cell", + y = "Number of UMI per cell", + title = "QC plot", subtitle = "Number of detected genes in function of number of UMI") ``` @@ -87,7 +88,7 @@ We can also use a histogram representation. I recommend the three types of figures because depending on the dataset, the best method to identify outliers is different. 
-``` r +```r hist(pbmc_small$nCount_RNA, breaks = 100, xlab = "Number of UMI per cell", @@ -98,7 +99,7 @@ abline(v = 10000, col = "red") -``` r +```r hist(pbmc_small$nFeature_RNA, breaks = 100, xlab = "Number of detected genes by cell", @@ -128,14 +129,12 @@ We will remove all the cells : - that detect less than 300 genes or more than 2300. - whose percentage of expressed genes of the MT genome exceeds 10% -``` r +```r ## Filtering SeuratObject pbmc_small <- subset(pbmc_small, percent_mito < 10 & - nCount_RNA > 650 & - nCount_RNA < 10000 & - nFeature_RNA > 300 & - nFeature_RNA < 2300) + (nCount_RNA > 650 & nCount_RNA < 10000) & + (nFeature_RNA > 300 & nFeature_RNA < 2300)) ## Plot ggplot(pbmc_small@meta.data, @@ -148,13 +147,14 @@ ggplot(pbmc_small@meta.data, high = "red", mid = "yellow", midpoint = 20) + - ggtitle("QC plot after filtering", "Number of detected genes in function of number of UMI")+ - labs(y = "Number of UMI per cell", x = "Number of detected genes by cell") + labs(x = "Number of detected genes by cell", + y = "Number of UMI per cell", + title = "QC plot after filtering", subtitle = "Number of detected genes in function of number of UMI") ``` -``` r +```r ## Update object in R console pbmc_small ``` @@ -197,7 +197,7 @@ median of the library size (= total number of UMIs per cell, = `nCount_RNA` in `meta.data`). If the scale factor is equal to 1e6 then we would get log2(CPM+1). *CPM : Count Per Million*. -``` r +```r ## Inter-cell normalization pbmc_small <- NormalizeData(pbmc_small, #SeuratObject assay = "RNA", #Assay to use @@ -230,7 +230,7 @@ relation between the expression mean and the variance of each gene. With the `nfeatures` parameter we retrieve the 2000 most variable genes according to the vst method. -``` r +```r pbmc_small <- FindVariableFeatures(pbmc_small, #SeuratObject selection.method = "vst", #Method nfeatures = 2000) #Top HVG (Highly Variable Gene), default value