Merge pull request #33 from MPUSP/dev
feat: improved reporting, export of figures
m-jahn authored Apr 11, 2024
2 parents 3ead644 + 79985b2 commit e4aad5b
Showing 7 changed files with 269 additions and 108 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ Major update to modules, added singularity container, fixed plotting issues.

- major update of all nf-core modules
- singularity container for fitness module and rmarkdown notebooks
- export of figures to dedicated output folder, improved report

### `Fixed`

218 changes: 160 additions & 58 deletions bin/counts_summary.Rmd
@@ -23,33 +23,16 @@ knitr::opts_chunk$set(echo = FALSE)

# Requirements

- if not all dependencies are satisfied by R environment, try to install additional libraries `dplyr`, `ggplot2`, `tidyr`, `Hmisc`
- loading libraries

```{r}
list_req_packages <- c("dplyr", "ggplot2", "tidyr", "Hmisc")
list_to_install <- setdiff(list_req_packages, rownames(installed.packages()))
if (length(list_to_install)) {
pkdir <- paste0(system("echo ${HOME}", intern = TRUE), "/.R")
system(paste0("mkdir ", pkdir))
.libPaths(new = pkdir)
install.packages(pkgs = list_to_install, lib = pkdir)
print(paste0("Missing package(s) ", paste(list_to_install, collapse = ", "), " are installed to '", pkdir, "'."))
} else {
print(paste0("Required package(s) ", paste(list_req_packages, collapse = ", "), " are already installed."))
}
```

- loading libraries
- loading libraries `dplyr`, `ggplot2`, `tidyr`, `Hmisc`

```{r}
suppressPackageStartupMessages({
for (pkg in list_req_packages) {
library(pkg, character.only = TRUE)
}
library(grDevices)
library(dplyr)
library(ggplot2)
library(tidyr)
library(Hmisc)
})
print(paste0("Required package(s) ", paste(list_req_packages, collapse = ", "), " are loaded."))
```

# Sample overview
@@ -101,7 +84,7 @@ print("Import of counts table complete.")
list_samples <- unique(df_counts$sample)
figwidth <- 9
figheight <- round(1 + (length(list_samples) / 4))
figheight2 <- 3 * figheight
figheight2 <- 1.6 * figheight
# output sample table
test <- df_counts %>%
@@ -145,110 +128,201 @@ custom_theme <- function(base_size = 12, base_line_size = 1.0, base_rect_size =
...
)
}
# function to export an image as svg and png
save_plot <- function(pl, path = "", width = 6, height = 6) {
pl_name <- deparse(substitute(pl))
pdf(
file = paste0(path, pl_name, ".pdf"),
width = width, height = height
)
print(pl)
dev.off()
grDevices::svg(
filename = paste0(path, pl_name, ".svg"),
width = width, height = height
)
print(pl)
dev.off()
grDevices::png(
filename = paste0(path, pl_name, ".png"),
width = width * 125, height = height * 125, res = 120
)
print(pl)
invisible(capture.output(dev.off()))
}
```

## Total number of mapped reads per sample

- figure shows the total number of mapped reads used for fitness score calculation
- all other reads that were filtered out during preprocessing (low quality, low mapping score) are not included

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
df_counts %>%
plot_total_mapped_reads <- df_counts %>%
dplyr::group_by(sample) %>%
dplyr::summarize(n_reads = sum(n_reads)) %>%
ggplot(aes(x = sample, y = n_reads)) +
coord_flip() +
geom_col(fill = custom_colors[1], alpha = 0.7) +
labs(x = "", y = "total number of mapped reads") +
custom_theme()
save_plot(plot_total_mapped_reads, width = figwidth, height = figheight)
print(plot_total_mapped_reads)
```

## Number of individual barcodes per sample

- figure shows the number of individual, unique barcodes per sample
- depending on `bowtie` settings, a certain number of mismatches in barcodes is allowed

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
df_counts %>%
plot_individual_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
dplyr::summarize(`unique barcodes per sample` = sum(n_reads > 0)) %>%
ggplot(aes(x = sample, y = `unique barcodes per sample`)) +
geom_col(fill = custom_colors[1], alpha = 0.7) +
labs(x = "") +
coord_flip() +
custom_theme()
save_plot(plot_individual_barcodes, width = figwidth, height = figheight)
print(plot_individual_barcodes)
```

## Number of missing barcodes per sample

- figure shows the number of missing barcodes per sample
- this number is determined from the total number of barcodes encountered across all samples

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
df_counts %>%
plot_missing_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
dplyr::summarize(`missing barcodes per sample` = sum(n_reads == 0)) %>%
ggplot(aes(x = sample, y = `missing barcodes per sample`)) +
geom_col(fill = custom_colors[1], alpha = 0.7) +
labs(x = "") +
coord_flip() +
custom_theme()
save_plot(plot_missing_barcodes, width = figwidth, height = figheight)
print(plot_missing_barcodes)
```

## Number of barcodes per gene, per sample

- figure shows the number of barcodes per gene, per sample
- the x-axis shows the number of genes with N barcodes, broken down by sample
- barcodes without mapped reads for the respective sample are removed
- for example, a colored bar labeled `≤ 2` shows the number of genes with at most 2 barcodes (see the small illustration below)
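
For illustration only (this is not part of the report code), the sketch below shows roughly how the `cut()`/`pretty()` binning and the upper-bound relabeling from the chunk below behave on a toy vector of per-gene barcode counts; the exact breakpoints depend on the data:

```r
# illustration only: number of barcodes for six hypothetical genes
x <- c(1, 2, 2, 3, 5, 8)
bins <- cut(x, breaks = pretty(x))
levels(bins)
#> e.g. "(0,2]" "(2,4]" "(4,6]" "(6,8]"

# keep only the upper bound of each interval, so "(0,2]" becomes 2,
# which the plot then displays as the label "<= 2"
as.numeric(gsub("\\([0-9]*,|\\]", "", bins))
#> e.g. 2 2 2 4 6 8
```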

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
df_counts %>%
df_barcodes_per_gene <- df_counts %>%
dplyr::filter(n_reads > 0) %>%
dplyr::group_by(sample, Gene) %>%
dplyr::summarize(`unique barcodes` = length(unique(sgRNA))) %>%
dplyr::summarize(`unique barcodes` = length(unique(sgRNA)), .groups = "drop") %>%
dplyr::mutate(`unique barcodes` = cut(`unique barcodes`, breaks = pretty(`unique barcodes`))) %>%
dplyr::group_by(sample) %>%
dplyr::count(`unique barcodes`) %>%
ggplot(aes(x = sample, y = n, fill = factor(`unique barcodes`))) +
dplyr::mutate(`unique barcodes` = as.numeric(gsub("\\([0-9]*,|\\]", "", `unique barcodes`))) %>%
dplyr::ungroup() %>%
dplyr::arrange(sample, n)
plot_barcodes_gene_sample <- df_barcodes_per_gene %>%
ggplot(aes(
x = sample, y = n,
fill = factor(`unique barcodes`),
label = paste0("≤ ", `unique barcodes`)
)) +
geom_col(alpha = 0.7) +
geom_text(aes(
x = sample,
y = unlist((tapply(n, sample, function(x) cumsum(rev(x)) - (rev(x) / 2)))),
label = rev(`unique barcodes`)
), color = "white") +
geom_text(position = position_stack(vjust = 0.5), color = "white") +
labs(x = "") +
coord_flip() +
scale_fill_manual(values = colorRampPalette(custom_colors[1:5])(9)) +
scale_fill_manual(values = colorRampPalette(
custom_colors[1:5])(length(unique(df_barcodes_per_gene[["unique barcodes"]])))
) +
custom_theme(legend.position = "none")
save_plot(plot_barcodes_gene_sample, width = figwidth, height = figheight)
print(plot_barcodes_gene_sample)
```

## Read count distribution, per sample (capped at 1000 barcodes)
## Read count distribution, violin plot

- figure shows the read count distribution per sample and barcode
- read count per barcode is only shown for the first 1000 barcodes to reduce processing time
- barcodes without mapped reads for the respective sample are removed
- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)

```{r, fig.width = figwidth, fig.height = figheight, warning = FALSE}
df_counts %>%
plot_count_dist <- df_counts %>%
dplyr::filter(n_reads > 0) %>%
dplyr::group_by(sample) %>%
dplyr::slice(1:1000) %>%
ggplot(aes(x = sample, y = log10(n_reads))) +
geom_violin(
trim = FALSE, fill = custom_colors[1],
alpha = 0.7, col = "white"
) +
labs(y = expression("log"[10] * " reads per barcode")) +
coord_flip() +
stat_summary(fun.data = mean_sdl, geom = "pointrange", size = 0.5, col = grey(0.3)) +
custom_theme()
save_plot(plot_count_dist, width = figwidth, height = figheight)
print(plot_count_dist)
```

## Number of reads per barcode, per sample
## Read count distribution, histogram

- figure shows the same data as above, but with the full set of barcodes per sample
- barcodes without mapped reads for the respective sample are removed
- read count is log 10 transformed (0 -> 1, 1 -> 10, 2 -> 100, ...)

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
df_counts %>%
ggplot(aes(x = log2(n_reads))) +
plot_reads_per_bc <- df_counts %>%
ggplot(aes(x = log10(n_reads))) +
geom_histogram(fill = custom_colors[1], alpha = 0.7, bins = 30) +
labs(y = "", x = expression("log"[2] * " reads per barcode")) +
facet_wrap(~sample, ncol = 2) +
labs(y = "", x = expression("log"[10] * " reads per barcode")) +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_reads_per_bc, width = figwidth, height = figheight2)
print(plot_reads_per_bc)
```

## Top 10 most abundant barcodes, per sample

- figure shows the top 10 barcodes ranked by read count
- ideally, initial time point samples show a highly even barcode abundance
- extremely high abundance of very few barcodes is a sign of a few mutants outcompeting the rest of the population
- this can happen when libraries are (pre-)cultivated for too long
- it can lead to downstream problems because it reduces library diversity (depletes low-abundance mutants)

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
df_counts %>%
plot_top10_barcodes <- df_counts %>%
dplyr::group_by(sample) %>%
dplyr::arrange(sample, desc(n_reads)) %>%
dplyr::mutate(rank = seq_along(sgRNA)) %>%
dplyr::filter(between(rank, 1, 10)) %>%
ggplot(aes(x = factor(rank), y = n_reads)) +
geom_col(fill = custom_colors[1], alpha = 0.7, width = 1) +
labs(y = "n reads", x = "barcodes ranked by abundance") +
facet_wrap(~sample, ncol = 2) +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_top10_barcodes, width = figwidth, height = figheight2)
print(plot_top10_barcodes)
```

## Cumulative read count distribution and population equality
## Cumulative read count distribution and barcode diversity

- figure shows the barcode diversity by plotting the fraction of reads (%) against the fraction of barcodes (%)
- an ideal library has high diversity and an equal distribution of barcodes at the initial time points
- such a distribution would follow the diagonal dashed grey line
- if reads per barcode (red line) are not well distributed, `% of reads` (y-axis) shows a steep ascent
- this means that very few barcodes contribute to almost all reads (a toy example follows below)
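
For illustration only (not part of the report code), the x/y data for such a curve can be built from a toy vector of barcode counts as sketched here; the actual computation on the counts table sits partly in the collapsed hunk below:

```r
# illustration only: four barcodes, most abundant first
n_reads <- c(16, 2, 1, 1)
pc_barcodes <- seq_along(n_reads) / length(n_reads) * 100  # % of barcodes: 25, 50, 75, 100
pc_reads <- cumsum(n_reads) / sum(n_reads) * 100           # % of reads:    80, 90, 95, 100
# a steep start (80% of reads from only 25% of barcodes) indicates low diversity
```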

```{r, fig.width = figwidth, fig.height = figheight2, warning = FALSE}
df_auc <- df_counts %>%
@@ -263,43 +337,58 @@ df_auc <- df_counts %>%
dplyr::summarize(pc_reads = sum(pc_reads), .groups = "drop_last") %>%
dplyr::mutate(pc_reads = cumsum(pc_reads))
df_auc %>%
plot_cumulative_read_count <- df_auc %>%
ggplot(aes(x = pc_barcodes, y = pc_reads, group = sample)) +
geom_line(linewidth = 1.0, color = custom_colors[1]) +
geom_step(linewidth = 1.0, color = custom_colors[1]) +
geom_abline(
slope = 1, intercept = 0, linewidth = 1.0,
linetype = 2, color = grey(0.5)
) +
facet_wrap(~sample, ncol = 2) +
lims(x = c(0, 100), y = c(0, 100)) +
labs(x = "% of barcodes", y = "% of reads") +
facet_wrap(~sample, ncol = 4) +
custom_theme()
save_plot(plot_cumulative_read_count, width = figwidth, height = figheight2)
print(plot_cumulative_read_count)
```

- this table shows the area under the curve (AUC) for the line plot above
- an AUC of 0.5 is ideal; an AUC approaching 1.0 is not optimal
- the 'Gini index' is a score between 0 and 1 measuring the inequality of the read distribution
- it is defined as `(AUC - AUC_optimal) / AUC_optimal`, with `AUC_optimal = 0.5` (a worked toy example follows below)
- a Gini score of 0 describes a perfectly equal distribution, a Gini score of 1.0 a perfectly unequal one
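
Continuing the toy example above (illustration only, not part of the report code), the trapezoid AUC helper defined in the chunk below and the Gini formula give:

```r
# toy values from the illustration above
pc_barcodes <- c(25, 50, 75, 100)  # % of barcodes
pc_reads <- c(80, 90, 95, 100)     # cumulative % of reads

# trapezoid rule, same helper as in the chunk below
calc_auc <- function(x, y) {
  sum(diff(x) * (head(y, -1) + tail(y, -1))) / 2
}

auc <- calc_auc(pc_barcodes, pc_reads) / 100^2  # 0.6875 on a 0-1 scale
gini <- (auc - 0.5) / 0.5                       # 0.375 -> intermediate inequality
```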

```{r, warning = FALSE}
calc_auc <- function(x, y) {
sum(diff(x) * (head(y, -1) + tail(y, -1))) / 2
}
df_auc %>%
dplyr::summarize(
auc = calc_auc(pc_barcodes, pc_reads),
gini = (auc - 5000) / 5000
auc = calc_auc(pc_barcodes, pc_reads)/100^2,
auc_optimal = 0.5,
gini = (auc - 0.5) / 0.5
) %>%
mutate(quality = case_when(
gini <= 0.2 ~ "OK",
gini > 0.2 & gini <= 0.5 ~ "warning: inequal read distribution",
gini > 0.5 ~ "warning: extremely inequal read distribution"
gini <= 0.33 ~ "low inequality (G < 0.33)",
gini > 0.33 & gini <= 0.66 ~ "intermediate inequality (0.33 < G < 0.66)",
gini > 0.66 ~ "high inequality (G > 0.66)"
))
```


## Sample and replicate correlation coefficient (R)

- figure shows a heat map of the correlation coefficient R (-1 ≤ R ≤ 1)
- the correlation coefficient measures how strongly read abundance is correlated between samples (a toy sketch follows below)
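
As a minimal, self-contained sketch (not part of the report code) of what the chunk below computes, `cor()` on a counts matrix returns the pairwise Pearson R between its sample columns:

```r
# illustration only: three hypothetical samples, rows are barcodes
m <- cbind(
  sample_1 = c(10, 200, 5, 80),
  sample_2 = c(12, 180, 7, 95),  # profile similar to sample_1 -> R close to 1
  sample_3 = c(500, 3, 400, 1)   # inverted profile -> strongly negative R
)
round(cor(m), 2)  # 3 x 3 matrix of pairwise correlation coefficients
```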

```{r, fig.width = 7.5, fig.height = 7, warning = FALSE}
df_counts %>%
df_correlation <- df_counts %>%
tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
dplyr::select(-c(1:2)) %>%
cor() %>%
cor()
plot_replicate_correlation <- df_correlation %>%
dplyr::as_tibble() %>%
dplyr::mutate(sample1 = colnames(.)) %>%
tidyr::pivot_longer(
@@ -316,10 +405,20 @@ df_counts %>%
colours = c(custom_colors[1], grey(0.9), custom_colors[2]),
limits = c(-1, 1)
)
write.csv(df_correlation, file = "correlation_table.csv")
save_plot(plot_replicate_correlation, width = 7.5, height = 7)
print(plot_replicate_correlation)
```

## Sample and replicate similarity with PCA

- this figure shows sample similarity, analogous to the figure above
- it uses principal component analysis (PCA) to reduce the multidimensional data to three main dimensions (a generic sketch of this step follows below)
- plotted are principal components 1 (x-axis), 2 (y-axis) and 3 (point size)
- replicates of the same sample should cluster together
- outliers should be easily visible
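
The `prcomp()` call itself sits in the collapsed part of the chunk below, so the following is only a generic, assumed sketch of such a PCA step (not copied from the file); the log transformation and default centering are assumptions:

```r
# assumed sketch: build a matrix with samples as rows and barcodes as columns
mat <- df_counts %>%
  tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
  dplyr::select(-c(1:2)) %>%  # drop the barcode/gene ID columns, as in the correlation chunk
  t()

# log-transform counts and run PCA (prcomp centers by default)
pca_result <- prcomp(log10(mat + 1))

# per-sample principal component coordinates, as used for plotting
df_PCA <- pca_result$x %>%
  as_tibble(rownames = "sample")
```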

```{r, fig.width = 7, fig.height = 7, warning = FALSE}
pca_result <- df_counts %>%
tidyr::pivot_wider(names_from = "sample", values_from = "n_reads") %>%
@@ -332,7 +431,7 @@ pca_result <- df_counts %>%
df_PCA <- pca_result$x %>%
as_tibble(rownames = "sample")
df_PCA %>%
plot_replicate_pca <- df_PCA %>%
ggplot(aes(x = PC1, y = -PC2, size = PC3, color = sample, label = sample)) +
geom_point(alpha = 0.7) +
geom_text(size = 2.5, show.legend = FALSE) +
Expand All @@ -343,13 +442,16 @@ df_PCA %>%
custom_theme(legend.position = "none", aspect.ratio = 1) +
scale_color_manual(values = colorRampPalette(custom_colors)(nrow(df_PCA))) +
guides(size = "none")
save_plot(plot_replicate_pca, width = 7, height = 7)
print(plot_replicate_pca)
```

# Report info

The template for this report is located in `./nf-core-crispriscreen/bin/counts_summary.Rmd`.

Date: 2022-04-28
Date: 2024-04-10

Author: Michael Jahn, PhD
