scrnaseq_hto.Rmd

---
date: "`r format(Sys.time(), '%B, %Y')`"
geometry: margin=2cm
output:
   html_document:
    toc: true
    toc_depth: 4
    toc_float: true
    code_folding: hide
    highlight: tango
    theme: paper
params:
  # Author
  author: !r Sys.info()[["user"]]

  # Project ID
  project_id: HTO_testDataset 
   
  # Input directory
  path_data: test_datasets/10x_pbmc_hto_GSE108313/counts
  
  # Metrics summary
  stats:
   
  # Output directory
  path_out: test_datasets/10x_pbmc_hto_GSE108313/demultiplexed
   
  # HTO names
  #   HTOs have an ID that is included in the "features.tsv" input file
  #   We additionally ask for readable names that are used throughout this report (optional)
  #   This could look like this, where HTO1-3 are the IDs included raw dataset
  #    - without names: 'hto.names: c("HTO1", "HTO2", "HTO3")'
  #    - with names: 'hto.names: !r setNames(c("NameA", "NameB", "NameC"), c("HTO1", "HTO2", "HTO3"))'
  # 
  hto_names: !r setNames(c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH"), c("htoA","htoB","htoC","htoD","htoE","htoF","htoG","htoH"))
   
  # HTO regular expression (alternative to HTO names)
  #   Identify HTOs via a regular expression. Names cannot be provided here.
  #   If specified will overrule hto_names
  hto_regex: NULL
   
  # HTO normalisation: CLR (default) or LogNormalize
  norm: "CLR"
   
  # Prefix of mitochondrial genes
  mt_names: "^MT-"
   
  # Main color to use for plots
  col: "palevioletred"
   
  # Downsample data to at most N cells (mainly for tests); set to NULL to deactivate
  downsample_cells_n: NULL
   
  # Path to git
  path_to_git: "."
   
  # Debugging mode: 
  # "default_debugging" for default, "terminal_debugger" for debugging without X11, "print_traceback" for non-interactive sessions 
  debugging_mode: "default_debugging"
title: "Single-cell RNA-seq data analysis of project `r params$project_id`"
subtitle: "Demultiplexing with hashtag oligos"
author: "`r params$author`"
---

```{r setup, message=FALSE, warning=FALSE}
# R Options
options(stringsAsFactors=FALSE,
        "citation_format"="pandoc", 
        dplyr.summarise.inform=FALSE, 
        knitr.table.format="html",
        future.globals.maxSize=2000000000, mc.cores=4, 
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator

# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr
# Tables: kableExtra
# Plots: ggsci

# Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source="fold-hide",      # by default collapse code blocks
                      dev=c("png", "pdf"),           # create figures in png and pdf; the first device (png) will be used for HTML output
                      dev.args=list(png=list(type="cairo"),  # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
                                    pdf=list(bg="white")),     # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
                      dpi=96,                        # figure resolution (note: final = dpi*fig.retina; will not work with pandoc see below)
                      fig.retina=2                   # retina multiplier for HTML output (note: final = dpi*fig.retina)
                                     
)
```

```{r preparation_and_initial_checks}
# Path for figures in png and pdf format
knitr::opts_chunk$set(fig.path=paste(params$path_out, "figures/", sep="/"))

# When using pandoc, setting the dpi via opts_chunk does not work (https://github.com/yihui/knitr/issues/901)
# As workaround, we define a function that takes care of that
#knitr::opts_hooks$set(dpi = function(options) {options$dpi = 40;options})

# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("R/functions_io.R",
              "R/functions_plotting.R",
              "R/functions_util.R")
git_files_to_source = paste(params$path_to_git, git_files_to_source, sep="/")
file_exists = purrr::map_lgl(git_files_to_source, file.exists)

if (any(!file_exists)) stop(paste("The following files could not be found:",paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", params$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))

# Debugging mode: 
# "default_debugging" for default, "terminal_debugger" for debugging without X11, "print_traceback" for non-interactive sessions 
invisible(switch (params$debugging_mode, 
        default_debugging=on_error_default_debugging(), 
        terminal_debugger=on_error_start_terminal_debugger(),
        print_traceback=on_error_just_print_traceback(),
        on_error_default_debugging()))

# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

# Create output directories
if (!file.exists(params$path_out)) dir.create(params$path_out, recursive=TRUE, showWarnings=FALSE)

# Do checks
error_messages = c()
# Check installed packages
#error_messages = c(error_messages, check_installed_packages())
# Check parameters TODO
#error_messages = c(error_messages, check_parameters(param))
```

```{r initial_checks_failed, eval=length(error_messages)>0, include=length(error_messages)>0}
# Now format collected error messages as error boxes, print and exit
# Note: asis_output prints a content as is; knit_exist toggles knitr to exit after the current chunk
error_message_blocks = purrr::map(error_messages, format_error) %>% paste(collapse="")
knitr::knit_exit()
knitr::asis_output(error_message_blocks)
```

# Read input data
In this first section of the report, we begin by printing mapping statistics that have been produced prior to this workflow.
```{r mapping_stats, message=TRUE, results="asis"}
# Are statistics provided?
if (!is.null(params$stats)) {
  cat("\n## Stats {.tabset}\n\n")
  mapping_stats = read.delim(params$stats,sep=",", header=FALSE, check.names=FALSE) %>% t() %>% as.data.frame()
  colnames(mapping_stats) = c("V1","Dataset")
  rownames(mapping_stats) = mapping_stats$V1
  mapping_stats$V1 = NULL

  mapping_stats_general = mapping_stats[!grepl("^Antibody:",rownames(mapping_stats)),,drop=FALSE]
  mapping_stats_antibodies = mapping_stats[grepl("^Antibody:",rownames(mapping_stats)),,drop=FALSE]
  
  cat("\n### General\n\n")
  print(knitr::kable(mapping_stats_general, align="l", caption="General statistics") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")))
 
  cat("\n### Antibodies\n\n")
  print(knitr::kable(mapping_stats_antibodies, align="l", caption="Antibody statistics") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")))
} else { 
  message("Mapping statistics cannot be shown. No valid file provided.")
}
```

## Dataset
Then, we read 10X data from the files produced by Cell Ranger:  

* **barcodes.tsv.gz**: All cell barcodes  
* **features.tsv.gz**: (Ensembl) ID, name, and type for each feature  
* **matrix.mtx.gz**: Counts for all features  

and setup a Seurat object. This object includes different data types in separate assays:  

* **Gene Expresssion** in assay **RNA**  
* **Antibody Capture** in **HTO** and **ADT**,   
* **CRISPR Guide Capture** in **Crispr**, and  
* **Custom** in **Custom**.   

Note that "Antibody Capture" features can correspond to "HTO" and "ADT" and are distinguished based on provided HTO names. 
```{r hto_read}
# Load the dataset with its assays and create a Seurat object; pass hto_names (or hto_regex if set) so that HTO will be a separate assay
sc = ReadSparseMatrix(params$path_data, project=params$project_id, row_name_column=1, hto_names=params$hto_names, hto_regex=params$hto_regex)
sc = sc[[1]]

# If requested: sample at most n cells
if (!is.null(params$downsample_cells_n)) {
  sampled_barcodes = sample(Seurat::Cells(sc), min(params$downsample_cells_n, length(Seurat::Cells(sc))))
  sc = subset(sc, cells=sampled_barcodes)
}

# Remember original assay names
original_assay_names = setdiff(Seurat::Assays(sc), "HTO")

# Set colours
hto_samples = rownames(sc[["HTO"]])
hto_colours = list()
hto_colours$col_hto_global = ggsci::pal_npg()(3)
names(hto_colours$col_hto_global) = c("Negative", "Doublet", "Singlet")

hto_colours$col_hto_collapsed = ggsci::pal_npg()(length(hto_samples) + 2)
hto_levels = c("Negative", "Doublet", hto_samples[1:length(hto_samples)])
names(hto_colours$col_hto_collapsed) = hto_levels
```

## HTO counts
Sometimes demultiplexing fails, because some HTOs generally do not perform well, or some cells end up with zero HTO counts. To assure the script can run through, we remove HTOs and cells with zero counts. 
```{r hto_counts, results="asis"}
# Check for HTOs with 0 HTO counts, and remove them if necessary 
hto_counts = Seurat::GetAssayData(sc, assay="HTO", slot="counts") 
hto_counts_row = hto_counts %>% Matrix::rowSums() %>% as.data.frame()
colnames(hto_counts_row) = "HtoCounts"
if (any(hto_counts_row$HtoCounts == 0)) {
  sc[["HTO"]] = subset(sc[["HTO"]], features=rownames(which(hto_counts_row$HtoCounts > 0)))
  warning("Discarded ", sum(hto_counts_row$HtoCounts == 0), " HTOs with 0 HTO counts.")
}

# Check for cells with 0 HTO counts, and remove them if necessary 
hto_counts_col = hto_counts %>% Matrix::colSums() %>% summary() %>% as.matrix() %>% as.data.frame()
colnames(hto_counts_col) = "HtoCounts"
if (any(hto_counts_col$HtoCounts == 0)) {
  sc = subset(sc, cells=rownames(which(hto_counts_col[, "HtoCounts"] > 0)))
  warning("Discarded ", sum(hto_counts_col$HtoCounts == 0), " cells with 0 HTO counts.")
}

# Check for HTOs how many cells they have at least 1 count
hto_counts_row$CellsWithAtLeast1HtoCount = sapply(1:nrow(hto_counts), function(x) sum(hto_counts[x,] > 0))

knitr::kable(hto_counts_row, align="l", caption="HTO counts per HTO") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
knitr::kable(hto_counts_col, align="l", caption="HTO counts per cell (summary)") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
```

# Demultiplexing with HTOs
This section of the report shows how cells are assigned to their sample-of-origin. 

## Normalisation of HTO counts
We start the analysis by normalising raw HTO counts. If `LogNormalize` was chosen for normalisation, HTO counts for each cell are divided by the total counts for that cell, multiplied by 10,000 and then natural-log transformed. If `CLR` was chosen for normalisation, HTO counts for each cell are divided by the geometric mean of the counts for that cell and then natural-log transformed. 
```{r hto_normalisation}
# HTO
sc = suppressWarnings(Seurat::NormalizeData(sc, assay="HTO", normalization.method=params$norm, verbose=FALSE))
```

## Classification of cells based on normalised HTO data
We assign cells to their sample-of-origin, annotate negative cells that cannot be assigned to any sample, and doublet cells that are assigned to two samples. 
```{r hto_demux, results="asis"}
# Demultiplex HTOs
sc = Seurat::HTODemux(sc, assay="HTO", positive.quantile=0.95, verbose=FALSE)

# Sort Idents levels for nicer plotting
Seurat::Idents(sc) = factor(Seurat::Idents(sc), levels=hto_levels)
sc$hash.ID = factor(sc$hash.ID, levels=hto_levels)
sc$HTO_classification.global = factor(sc$HTO_classification.global, levels=c("Negative", "Doublet", "Singlet"))

# HTO classification results
hash_ID_table = sc[["hash.ID"]] %>% 
  dplyr::count(hash.ID) %>% 
  dplyr::rename(HTO=hash.ID) %>% 
  dplyr::mutate(Perc=round(n/sum(n)*100,2)) %>% 
  as.data.frame

p1 = ggplot(sc[["HTO_classification.global"]] %>% dplyr::count(HTO_classification.global), 
            aes(x="", y=n, fill=HTO_classification.global)) + 
  geom_bar(width=1, stat="identity") + 
  coord_polar("y", start=0, direction=-1) + 
  AddStyle(title="HTO global classification results", 
           fill=hto_colours$col_hto_global, 
           xlab="", ylab="")

p2 = ggplot(sc[["hash.ID"]] %>% dplyr::count(hash.ID), 
            aes(x="", y=n, fill=hash.ID)) + 
  geom_bar(width=1, stat="identity") + 
  coord_polar("y", start=0, direction=-1) + 
  AddStyle(title="HTO classification results", 
           fill=hto_colours$col_hto_collapsed,
           xlab="", ylab="")

p = p1 + p2 + 
  patchwork::plot_annotation(title="HTO classification results")
p

knitr::kable(hash_ID_table, align="l", caption="HTO classification results") %>% 
        kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
```

# Visualisation of HTO data
This section of the report visualises raw and normalised HTO data to understand whether the demultiplexing step has worked well. 
```{r hto_plot_counts_raw, dpi=96}
# Distribution of HTO counts before and after normalisation
hto_t_raw = Seurat::GetAssayData(sc, assay="HTO", slot="counts") %>% 
  as.data.frame %>% t %>% as.data.frame
hto_t_raw_pseudo = hto_t_raw + 1
hto_t_norm = Seurat::GetAssayData(sc, assay="HTO", slot="data") %>% 
  as.data.frame %>% t %>% as.data.frame

p1 = ggplot(hto_t_raw_pseudo %>% tidyr::pivot_longer(tidyr::everything()), 
            aes(x=name, y=value, fill=name, group=name)) + 
  geom_violin() + 
  scale_y_continuous(trans="log2") + 
  AddStyle(title="HTO raw counts", 
           fill=hto_colours$col_hto_collapsed, 
           legend_position="none", 
           xlab="", ylab="") +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))
  
p2 = ggplot(hto_t_norm %>% tidyr::pivot_longer(tidyr::everything()), 
            aes(x=name, y=value, fill=name, group=name)) + 
  geom_violin() + 
  AddStyle(title="HTO normalised counts", 
           fill=hto_colours$col_hto_collapsed, 
           legend_position="none",
           xlab="", ylab="") + 
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1))

p = p1 + p2
p = p + patchwork::plot_annotation("HTO counts before and after normalisation")
p
```

Pairs of raw (top) and normalised (bottom) HTO counts are visualised to confirm mutal exclusivity in singlet cells. Data points correspond to measured HTO counts per HTO, colours correspond to the assigned samples-of-origin. 

```{r hto_plot_counts_norm, fig.height=10}
n = DfAllColumnCombinations(x=hto_t_raw_pseudo, cell_classification=sc$hash.ID)

# Plot
p = ggplot(n, aes(x=value1, y=value2, color=cell_classification)) + 
  geom_point() + 
  scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") + 
  AddStyle(col=hto_colours$col_hto_collapsed)

p = p + facet_grid(name2~name1, drop=FALSE) + 
  theme(axis.title.x=element_blank(), strip.text.x=element_text(size=10, color="black"),
        axis.title.y=element_blank(), strip.text.y=element_text(size=10, color="black"),
        strip.background = element_rect(colour="white", fill="lightgrey"),
        legend.position="bottom", 
        axis.text.x=element_text(angle=45, hjust=1, vjust=0.5)) + 
  patchwork::plot_annotation("Raw HTO counts")
p
```

```{r hto_plot_norm_scatter, fig.height=10}  
n = DfAllColumnCombinations(x=hto_t_norm, cell_classification=sc$hash.ID)
n = n[n$value1 > 0 & n$value2 > 0, ]
p = ggplot(n, aes(x=value1, y=value2, color=cell_classification)) + 
  geom_point() + 
  scale_x_continuous(trans="log2") + scale_y_continuous(trans="log2") + 
  AddStyle(col=hto_colours$col_hto_collapsed)

p = p + facet_grid(name2~name1, drop=FALSE) + 
  theme(axis.title.x=element_blank(), strip.text.x=element_text(size=10, color="black"),
        axis.title.y=element_blank(), strip.text.y=element_text(size=10, color="black"),
        strip.background = element_rect(colour="white", fill="lightgrey"),
        legend.position="bottom") + 
  patchwork::plot_annotation("Normalised HTO data")
suppressMessages(p)
```

The following ridge plots visualise the enrichment of assigned sample-of-origin for the respective normalised HTO counts. 
```{r hto_plot_norm, fig.height=10, message=FALSE}
# Group cells based on HTO classification 
p_list = RidgePlot(sc, assay="HTO", features=rownames(Seurat::GetAssay(sc, assay="HTO")), 
                   same.y.lims=TRUE, cols=hto_colours$col_hto_collapsed, combine=FALSE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(legend_position="none")
p = patchwork::wrap_plots(p_list, ncol = 2) + 
  patchwork::plot_annotation("Normalised HTO data")
suppressMessages(p)
```

Lastly, we compare the counts and number of features between classified cells. 
```{r hto_plot_features}
# Number of counts and features in the different cells
nfeature_metrics = grep("_HTO", 
                        grep("nFeature_", colnames(sc[[]]), v=TRUE),
                        v=TRUE,
                        invert=TRUE)
ncounts_metrics = grep("_HTO", 
                        grep("nCount_", colnames(sc[[]]), v=TRUE),
                        v=TRUE,
                        invert=TRUE)

p1 = VlnPlot(sc, features=ncounts_metrics, idents=levels(Seurat::Idents(sc)), pt.size=0) + 
  geom_violin(color=NA) + 
  AddStyle(title="Counts (RNA)", 
           fill=hto_colours$col_hto_collapsed, 
           legend_position="none", 
           xlab="") +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) 

p2 = VlnPlot(sc, features=nfeature_metrics, idents=levels(Seurat::Idents(sc)), pt.size=0) + 
  geom_violin(color=NA) +
  AddStyle(title="Features (RNA)", 
           fill=hto_colours$col_hto_collapsed, 
           legend_position="none", 
           xlab="") +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) 

p = patchwork::wrap_plots(list(p1, p2), ncol=2)
p = p + patchwork::plot_annotation("Number of counts and features for HTO-classified cells")
p
```

# Keeping singlets
This section of the report states the number of cells that remain after negative and doublet cells are removed. 
```{r hto_subset}
sc.all = sc
sc = subset(sc, idents=c("Negative", "Doublet"), invert=TRUE)
sc
```

# Preliminary visualisation of RNA data
This section of the report provides first insights into your RNA dataset based on a preliminary pre-processing of the RNA data using the standard scRNA-seq workflow. 
```{r prelim_analysis, results="hide"}
# Normalise, scale, pca umap and clustering for sc
sc = suppressWarnings(Seurat::NormalizeData(sc, assay="RNA", normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE))
sc = Seurat::FindVariableFeatures(sc, assay="RNA", selection.method="vst", verbose=FALSE)
sc = Seurat::ScaleData(sc, assay="RNA", features=Seurat::VariableFeatures(sc, assay="RNA"), verbose=FALSE)
sc = Seurat::RunPCA(sc, assay="RNA", features=VariableFeatures(object=sc, assay="RNA"), verbose=FALSE, npcs=min(50, ncol(sc)))
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:10, verbose=FALSE, umap.method="uwot"))
sc = Seurat::FindNeighbors(sc, dims=1:10, verbose=FALSE)
sc = Seurat::FindClusters(sc, algorithm=1, verbose=FALSE, method="igraph")
Idents(sc) = "hash.ID"

# Normalise, scale, pca umap and clustering for sc.all
sc.all = suppressWarnings(Seurat::NormalizeData(sc.all, assay="RNA", normalization.method = "LogNormalize", scale.factor=10000, verbose=FALSE))
sc.all = Seurat::FindVariableFeatures(sc.all, assay="RNA", selection.method="vst", verbose=FALSE)
sc.all = Seurat::ScaleData(sc.all, assay="RNA", features=Seurat::VariableFeatures(sc.all, assay="RNA"), verbose=FALSE)
sc.all = Seurat::RunPCA(sc.all, assay="RNA", features=VariableFeatures(object=sc.all, assay="RNA"), verbose=FALSE, npcs=min(50, ncol(sc.all)))
sc.all = suppressWarnings(Seurat::RunUMAP(sc.all, dims=1:10, verbose=FALSE, umap.method="uwot"))
sc.all = Seurat::FindNeighbors(sc.all, dims=1:10, verbose=FALSE)
sc.all = Seurat::FindClusters(sc.all, algorithm=1, verbose=FALSE, method="igraph")
Idents(sc.all) = "hash.ID"

# Set figure height
sc_fig_height = round((length(hto_samples))*1.25)
sc_qc_fig_height = round((length(levels(sc$seurat_clusters)))*0.75)
sc_all_fig_height = round((length(hto_samples)+2)*1.25)
sc_all_qc_fig_height = round((length(levels(sc.all$seurat_clusters)))*0.75)

# Set cluster colors
cluster_names = levels(sc$seurat_clusters)
sc_col_clusters = GenerateColours(num_colours=length(cluster_names), palette="ggsci::pal_d3", palette_options = list(palette="category20"))
names(sc_col_clusters) = cluster_names

cluster_names = levels(sc.all$seurat_clusters)
sc_all_col_clusters = GenerateColours(num_colours=length(cluster_names), palette="ggsci::pal_d3", palette_options = list(palette="category20"))
names(sc_all_col_clusters) = cluster_names
```

## Visualisation with UMAP {.tabset}
We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the assigned sample-of-origin. 

Take care not to mis-read a UMAP:  

* Parameters influence the plot (we use defaults here)  
* Cluster sizes relative to each other mean nothing, since the method has a local notion of distance  
* Distances between clusters might not mean anything  
* You may need more than one plot  
  
For a nice read to intuitively understand UMAP, see https://pair-code.github.io/understanding-umap/. 

### Before filtering
```{r umap_before}
p = Seurat::DimPlot(sc.all, reduction="umap", group.by="hash.ID", cols=hto_colours$col_hto_collapsed) + 
  AddStyle(title="UMAP, cells coloured by HTO classification, including doublets and negatives", 
           legend_title="Cell classification", 
           legend_position="bottom")
p
```

* Note: HTO colors are the same before and after filtering

### After filtering
```{r umap_after}
# Plot UMAP after HTO filtering
p = Seurat::DimPlot(sc, reduction="umap", group.by="hash.ID", cols=hto_colours$col_hto_collapsed) + 
  AddStyle(title="UMAP, cells coloured by HTO classification, singlets only", 
           legend_title="Cell classification", 
           legend_position="bottom")
p
```

* Note: HTO colors are the same before and after filtering

### Before filtering (each hto)
```{r umap_cluster_before, fig.height=sc_all_fig_height}
p = Seurat::DimPlot(sc.all, reduction="umap", group.by="seurat_clusters", split.by="hash.ID", ncol=2, cols=sc_all_col_clusters) + 
  AddStyle(title="UMAP for each HTO, including doublets and negatives", 
           legend_title="Cluster", 
           legend_position="right")
p
```

* Important: Cluster colors are not neccessarily the same before and after filtering!

### After filtering (each hto)
```{r umap_cluster_after, fig.height=sc_fig_height}
p = Seurat::DimPlot(sc, reduction="umap", group.by="seurat_clusters", split.by="hash.ID", ncol=2, cols=sc_col_clusters) + 
  AddStyle(title="UMAP for each HTO, singlets only", 
           legend_title="Cluster", 
           legend_position="right")
p
```

* Important: Cluster colors are not neccessarily the same before and after filtering!

## Distribution of cells in HTO and in clusters

Next, we examine how the HTOs are distributed in the individual clusters and how the clusters are distributed in each individual HTO. We include negatives and doublets as they often group into specific clusters.

```{r hto_cluster distribution}

p = ggplot(sc.all[[c("seurat_clusters","hash.ID")]], aes(x=seurat_clusters, fill=hash.ID)) +
  geom_bar(position="fill", colour=NA) +
  scale_x_discrete("Cluster") +
  scale_y_continuous("Fraction of cells") +
  scale_fill_manual(values=hto_colours$col_hto_collapsed) +
  AddStyle(title="HTO cells in cluster", 
           legend_title="Cell classification", 
           legend_position="right")  
p

p = ggplot(sc.all[[c("seurat_clusters","hash.ID")]], aes(x=hash.ID, fill=seurat_clusters)) +
  geom_bar(position="fill", colour=NA) +
  scale_x_discrete("Cluster") +
  scale_y_continuous("Fraction of cells") +
  scale_fill_manual(values=sc_all_col_clusters) +
  AddStyle(title="Clusters in individual HTO", 
           legend_title="Cluster", 
           legend_position="right")  +
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) 
p
```

Here it also helps to inspect RNA counts and features per cluster and hto.

```{r hto_cluster_counts, fig.height=sc_all_qc_fig_height}
ncounts_metrics = grep("_HTO", 
                        grep("nCount_", colnames(sc.all[[]]), v=TRUE),
                        v=TRUE,
                        invert=TRUE)

p = ggplot(sc.all[[c("hash.ID", "seurat_clusters", ncounts_metrics[1])]], aes_string(x="hash.ID", y=ncounts_metrics[1], fill="hash.ID", group="hash.ID")) +
  geom_violin() +
  xlim(levels(sc.all$hash.ID)) +
  scale_y_continuous("Counts (RNA)") +
  facet_wrap(~seurat_clusters, ncol=3, scales="free_y") +
  #scale_fill_manual(values=hto_colours$col_hto_collapsed) +
  AddStyle(fill=hto_colours$col_hto_collapsed, 
           legend_title="Classified cells",
           legend_position="bottom", 
           xlab="") +
  theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
p = p + patchwork::plot_annotation("Number of counts for each cluster and HTO")
p
```

```{r hto_cluster_features, fig.height=sc_all_qc_fig_height}
nfeature_metrics = grep("_HTO", 
                        grep("nFeature_", colnames(sc.all[[]]), v=TRUE),
                        v=TRUE,
                        invert=TRUE)


p = ggplot(sc.all[[c("hash.ID", "seurat_clusters", nfeature_metrics[1])]], aes_string(x="hash.ID", y=nfeature_metrics[1], fill="hash.ID", group="hash.ID")) +
  geom_violin() +
  xlim(levels(sc.all$hash.ID)) +
  facet_wrap(~seurat_clusters, ncol=3, scales="free_y") +
  scale_y_continuous("Features (RNA)") +
  #scale_fill_manual(values=hto_colours$col_hto_collapsed) +
  AddStyle(fill=hto_colours$col_hto_collapsed, 
           legend_title="Classified cells",
           legend_position="bottom", 
           xlab="") +
  theme(axis.text.x = element_text(angle=45, hjust=1), legend.position="none")
p = p + patchwork::plot_annotation("Number of features for each cluster and HTO")
p
```

# Write out demultiplexed data
Finally, demultiplexed RNA data are written back to file. Results are included as metadata which may help for further analysis (e.g. filtering).
```{r hto_save_samples}
# Save each sample in a separate directory
Idents(sc) = "hash.ID"
sc$HTO_hash_ID = sc$hash.ID
samples = levels(Seurat::Idents(sc))
demux_samples_paths = c()
for (s in samples) {
  p = ExportSeuratAssayData(sc[,Seurat::Idents(sc)==s], 
                            dir=file.path(params$path_out, s), 
                            assays=original_assay_names, 
                            slot="counts",
                            include_cell_metadata_cols=c("HTO_classification", 
                                                         "HTO_hash_ID"))
  demux_samples_paths = c(demux_samples_paths, p)
}
message("Demultiplexed datasets are: ", paste(demux_samples_paths, collapse=", "))

# Export cell classification for Loupe
demux_samples_paths = c()
sc.all$HTO_hash_ID = sc.all$hash.ID
loupe_meta = as.data.frame(sc.all@meta.data)
loupe_meta = loupe_meta[,c("HTO_classification", "HTO_hash_ID")]
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
p = file.path(params$path_out, "cell_classification_for_loupe.csv")
write.table(x=loupe_meta, file=p, col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
demux_samples_paths = c(demux_samples_paths, p)

message(paste("Classification file for Loupe is:", demux_samples_paths))
```

# Credits
This Pre-Workflow was developed as part of the [scrnaseq](https://github.com/ktrns/scrnaseq) GitHub repository by Katrin Sameith and Andreas Petzold at the [Dresden-concept Genome Center, TU Dresden](https://genomecenter.tu-dresden.de/) (Dresden, Germany), in collaboration with Torsten Glomb and Oliver Dittrich at the [Research Core Unit Genomics, Hannover Medical School](https://www.mhh.de/genomics) (Hannover, Germany). The Seurat Vignettes were initially used as templates. 

# Parameter table
The following parameters were used to run the workflow.  
```{r parameters_table}
out = ScrnaseqParamsInfo(params=params)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")
```

# Software versions
This report was generated using the [scrnaseq](https://github.com/ktrns/scrnaseq) GitHub repository. Software versions were collected at run time. 
```{r versions}
out = ScrnaseqSessionInfo(params$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"), full_width=FALSE, position="left")
```