diff --git a/DESCRIPTION b/DESCRIPTION index 9089c61..1d55d3b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: MicrobiomeBenchmarkDataAnalyses Title: Benchmarking analyses with the MicrobiomeBenchmarkData package -Version: 0.99.16 +Version: 0.99.17 Authors@R: person("Samuel David", "Gamboa-Tuz", email = "Samuel.Gamboa.Tuz@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6863-7943")) diff --git a/vignettes/articles/Figure4.pdf b/vignettes/articles/Figure4.pdf index 4abb018..490c680 100644 Binary files a/vignettes/articles/Figure4.pdf and b/vignettes/articles/Figure4.pdf differ diff --git a/vignettes/articles/Ravel_2011_16S_BV.Rmd b/vignettes/articles/Ravel_2011_16S_BV.Rmd index ce0ec87..a936252 100644 --- a/vignettes/articles/Ravel_2011_16S_BV.Rmd +++ b/vignettes/articles/Ravel_2011_16S_BV.Rmd @@ -23,17 +23,13 @@ library(ggpubr) library(tidySummarizedExperiment) ``` -# Introduction - In this vignette, several differential abundance (DA) methods will be compared using the *Ravel\_2011\_16S\_BV* dataset. Lactobacillus is expected to -be enriched in healthy vagina (HV) samples and other taxa, such as Gardnerella -and Prevotella, are expected to be more abundant or enriched in -bacterial vaginosis (BV) samples. - -# Data +be enriched in healthy vagina (HV) samples. BV-associated taxa, +such as Gardnerella and Prevotella, are expected to be more abundant or +enriched in bacterial vaginosis (BV) samples. -## Import, summarize by genus, and filter +## Data + Select equal number of samples per ethnicity group. This was based on the minimum number of samples in an ethnicity. @@ -45,12 +41,20 @@ conditions_col <- 'study_condition' conditions <- c(condB = 'healthy', condA = 'bacterial_vaginosis') tse <- getBenchmarkData(dat_name, dryrun = FALSE)[[1]] - -## Select equal number of samples per ethnicity group col_data <- tse |> colData() |> - as.data.frame() |> + as.data.frame() |> dplyr::filter(study_condition %in% conditions) +col_data |> + summarise( + .by = c("ethnicity", "nugent_score_category", "study_condition"), + range = paste0(min(nugent_score), "-", max(nugent_score)), + n = n() + ) |> + arrange(ethnicity, study_condition, n) +``` + +```{r} row_names_list <- col_data |> {\(y) split(y, factor(y$ethnicity))}() |> {\(y) map(y, ~split(.x, .x$study_condition))}() |> @@ -63,6 +67,7 @@ set.seed(4567) select_samples <- row_names_list |> {\(y) map(y, ~ sample(.x, min_n, replace = FALSE))}() |> unlist(use.names = FALSE) +# select_samples <- unique(unlist(row_names_list)) tse_subset <- tse[, select_samples] ## Summarize by genus @@ -81,7 +86,19 @@ colData(tse_genus)$study_condition <- tse_genus ``` -## Get prior info +```{r} +tse_genus |> + colData() |> + as_tibble() |> + summarise( + .by = c("ethnicity", "nugent_score_category", "study_condition"), + range = paste0(min(nugent_score), "-", max(nugent_score)), + n = n() + ) |> + arrange(ethnicity, study_condition, n) +``` + +## Prior info - biological annotations ```{r prior info} prior_info <- tse_genus |> @@ -98,30 +115,25 @@ prior_info <- tse_genus |> head(prior_info) ``` -## Convert to phyloseq +## DA analysis + +Convert to phyloseq ```{r convert to phyloseq} -ps <- makePhyloseqFromTreeSummarizedExperiment(tse_genus) -sample_data(ps)[[conditions_col]] <- - factor(sample_data(ps)[[conditions_col]], levels = conditions) +ps <- convertToPhyloseq(tse_genus) +sample_data(ps)[[conditions_col]] <- factor( + sample_data(ps)[[conditions_col]], levels = conditions +) ps ``` -# Benchdamic workflow - -## Set DA methods - -```{r} -## Normalization methods supported in benchdamic +```{r normalization, weights, DA methods} norm_methods <- set_norm_list() -# norm_methods <- norm_methods[names(norm_methods) != "norm_CSS"] ps <- runNormalizations(norm_methods, ps, verbose = FALSE) zw <- weights_ZINB(ps, design = conditions_col) DA_methods <- set_DA_methods_list(conditions_col, conditions) -## The following chunk of code was written for compatibility with -## a more recent version of Seurat implemented in benchdamic for (i in seq_along(DA_methods)) { if (grepl("Seurat", names(DA_methods)[i])) { names(DA_methods[[i]]$contrast) <- NULL @@ -129,16 +141,12 @@ for (i in seq_along(DA_methods)) { next } } -# These methods throw an error, so they must be removed -# DA_methods <- DA_methods[!names(DA_methods) == 'DA_ALDEx2.1'] -# DA_methods <- DA_methods[!names(DA_methods) == 'DA_corncob.1'] -# DA_methods <- DA_methods[!names(DA_methods) == 'DA_edgeR.1'] names(DA_methods) ``` -## Run DA methods +Run DA analysis: -```{r, warning=FALSE, message=FALSE} +```{r run DA, warning=FALSE, message=FALSE} tim <- system.time({ DA_output <- vector("list", length(DA_methods)) for (i in seq_along(DA_output)) { @@ -156,53 +164,43 @@ tim <- system.time({ tim ``` -# Enrichment - -Get direction - -```{r get direction (effect size)} -direction <- get_direction_cols(DA_output, conditions_col, conditions) -head(direction) -``` - - - -```{r} -hist(abs(DA_output$lefse.CLR$statInfo$LDA_scores)) -``` - +## Enrichment -```{r} -hist(abs(DA_output$lefse.TSS$statInfo$LDA_scores)) -``` +We need a threshold for DA for lefse-CLR +(It can't be the same as when using lefse-TSS): - -```{r} +```{r median lefser} c( lefse.TSS = median(DA_output$lefse.TSS$statInfo$abs_score), lefse.CLR = median(DA_output$lefse.CLR$statInfo$abs_score) ) ``` - Create some variables for selecting and ranking differentially abundant features: -```{r} +```{r variables for enrichment funs} +direction <- get_direction_cols(DA_output, conditions_col, conditions) + adjThr<- rep(0.1, length(DA_output)) names(adjThr) <- names(DA_output) esThr <- rep(0, length(DA_output)) names(esThr) <- names(DA_output) + +## Lefse needs an effect size threshold because the +## method requires both LDA and p-value. +## Otherwise, it would only be a wilcox-test esThr[grep("lefse.TSS", names(esThr))] <- 2 esThr[grep("lefse.CLR", names(esThr))] <- 0.15 +## Lefse will be based on LDA rather than p-value slotV <- ifelse(grepl("lefse", names(DA_output)), "statInfo", "pValMat") colNameV <- ifelse(grepl("lefse", names(DA_output)), "LDA_scores", "adjP") typeV <- ifelse(grepl("lefse", names(DA_output)), "logfc", "pvalue") ``` -Create enrichment. Threshold values is based on adjusted p-values +Create enrichment: ```{r enrichment} enrichment <- createEnrichment( @@ -218,37 +216,20 @@ enrichment <- createEnrichment( alternative = "greater", verbose = FALSE ) - -# enrichment <- createEnrichment( -# object = DA_output, -# priorKnowledge = prior_info, -# enrichmentCol = "taxon_annotation", -# namesCol = "taxon_name", -# slot = "pValMat", colName = "adjP", type = "pvalue", -# direction = direction, -# threshold_pvalue = 0.1, -# threshold_logfc = 0, -# top = NULL, -# alternative = "greater", -# verbose = FALSE -# ) - ``` Create enrichment summary: -```{r} +```{r enrichment summary} enrichmentSummary <- purrr::map(enrichment, ~ { .x$summaries |> purrr::map(function(x) { - pos <- which(colnames(x) != "pvalue") x |> tibble::rownames_to_column(var = "direction") |> tidyr::pivot_longer( names_to = "annotation", values_to = "n", cols = 2 ) - }) |> dplyr::bind_rows() |> dplyr::relocate(pvalue) @@ -273,7 +254,7 @@ enrichmentSummary <- purrr::map(enrichment, ~ { Create enrichment plot: -```{r} +```{r enrichment plot} enPlot <- enrichmentSummary |> left_join(get_meth_class(), by = "method") |> mutate( @@ -319,43 +300,11 @@ enPlot <- enrichmentSummary |> ) ``` - - - - - - - -```{r enrichment plot, warning=FALSE, message=FALSE} -# Create enrichment plot -## Not a plot. This is a data.frame that should be used as input for -## the plot_enrichment2 function. -# enrich_plot <- plot_enrichment( -# enrichment = enrichment, -# enrichment_col = "taxon_annotation", -# levels_to_plot = c("hv-associated", "bv-associated"), -# conditions = c(condB = "HV", condA = "BV") -# ) - -## The actual plot. -# enrich_plot2 <- plot_enrichment_2( -# enrich_plot, -# dir = c(up = 'BV', down = 'HV') -# ) + -# theme( -# axis.title = element_text(size = 17), -# axis.text = element_text(size = 15), -# legend.text = element_text(size = 13), -# strip.text = element_text(size = 17) -# ) -# enrich_plot2 -``` - -# Plot putative true positves and true negatives ratio +## Plot putative true positves and true negatives ratio Create 'positives' object. No thresholds were added. -```{r putative positives} +```{r putative positives data} positives <- map(1:length(DA_output), .f = function(i) { positives <- createPositives( object = DA_output[i], @@ -372,38 +321,9 @@ positives <- map(1:length(DA_output), .f = function(i) { FP = list(c("DOWN Abundant", "bv-associated"), c("UP Abundant", "hv-associated")) ) |> left_join(get_meth_class(), by = 'method') -}) |> bind_rows() - - - - - - - - -# positives <- createPositives( -# object = DA_output, -# priorKnowledge = prior_info, -# enrichmentCol = "taxon_annotation", namesCol = "taxon_name", -# slot = "pValMat", colName = "rawP", type = "pvalue", -# direction = direction, -# threshold_pvalue = 1, -# threshold_logfc = 0, -# top = seq.int(from = 0, to = 20, by = 2), -# alternative = "greater", -# verbose = FALSE, -# TP = list(c("DOWN Abundant", "hv-associated"), c("UP Abundant", "bv-associated")), -# FP = list(c("DOWN Abundant", "bv-associated"), c("UP Abundant", "hv-associated")) -# ) |> -# left_join(get_meth_class(), by = 'method') |> -# relocate(method_class) -``` - -Create putative positives plot - -```{r plot positves, fig.width=15, fig.height=10} -positives <- positives |> - mutate( +}) |> + bind_rows() |> + mutate( base_method = case_when( grepl("lefse", base_method) ~ sub("lefse", "LEfSe", base_method), grepl("wilcox", base_method) ~ sub("wilcox", "Wilcox", base_method), @@ -415,11 +335,15 @@ positives <- positives |> TRUE ~ method ) ) +``` + +Create "positives" plot: + +```{r plot positves, fig.width=15, fig.height=10} vec <- positives$color names(vec) <- positives$base_method posPlot <- positives |> - # mutate(diff = jitter(TP - FP, amount = 1.5, factor = 2)) |> mutate(diff = TP - FP) |> ggplot(aes(top, diff)) + geom_line( @@ -441,75 +365,37 @@ posPlot <- positives |> scale_color_manual(values = vec, name = "Base method") + theme_minimal() + theme(legend.position = "bottom") -# plots <- plot_positives(positives) |> -# map( ~ { -# .x + -# theme( -# axis.title = element_text(size = 17), -# axis.text = element_text(size = 15), -# legend.text = element_text(size = 13), -# strip.text = element_text(size = 17) -# ) -# }) -# k <- grid.arrange(grobs = plots, ncol = 3) ``` -```{r, fig.width=15, fig.height=15, warning=FALSE, eval=FALSE} -# ePlot <- ggarrange( -# enrich_plot2, k, ncol = 1, labels = c("a)", "b)"), heights = c(8, 10) -# ) +Combine enrichment and "positives" plot: -``` - - -```{r, fig.height=9, fig.width=10} +```{r combine enrichment and pos plots, fig.height=9, fig.width=10} pp <- ggarrange( plotlist = list(enPlot, posPlot), ncol = 1, heights = c(1.5, 1) ) pp ``` - - - -```{r} -# ggsave( -# filename = "Figure2.pdf", plot = ePlot, -# dpi = 300, height = 15, width = 15 -# ) +```{r save plots, eval=FALSE, echo=FALSE} ggsave( filename = "Figure2.pdf", plot = pp, dpi = 300, height = 9, width = 11 ) ``` -# Perform DA with lefse, Wilcox, and ZINQ-Cauchy manually +# Perform DA with lefse and Wilcox ```{r manual transformations} tssFun <- function(x) { (x) / sum(x) * 1e6 } + clrFun <- function(x) { log(x / exp(mean(log(x)))) } - -## Relative abundance (TSS - total sum scaling) assay(tse_genus, "TSS") <- apply(assay(tse_genus, "counts") + 1, 2, tssFun) assay(tse_genus, "CLR") <- apply(assay(tse_genus, "counts") + 1, 2, clrFun) -## No need for pseudocount in the next line -assay(tse_genus, "TSS + CLR") <- apply(assay(tse_genus, "TSS"), 2, clrFun) - -## CLR transform -# assay(tse_genus, 'CLR') <- apply(assay(tse_genus), 2, function(x) { -# log((x + 1) / exp(mean(log(x + 1)))) -# }) - -## Relative abundance + CLR transform -# assay(tse_genus, 'TSS + CLR') <- apply(assay(tse_genus, 'TSS'), 2, function(x) { -# # x / exp(mean(log(x))) -# log(x / exp(mean(log(x)))) -# }) data <- tse_genus |> as_tibble() |> @@ -532,8 +418,9 @@ calcWilcox <- function(dat, val_col, log = FALSE) { ## Separate components taxa <- split(dat, factor(dat$taxon_name)) taxa_names <- names(taxa) - taxa_annotations <- - dplyr::distinct(dplyr::select(data, dplyr::starts_with('taxon'))) + taxa_annotations <- data |> + dplyr::select(tidyselect::starts_with('taxon')) |> + dplyr::distinct() ## Perform Wilcoxon test pvalues <- vector('double', length(taxa)) @@ -589,11 +476,10 @@ calcWilcox <- function(dat, val_col, log = FALSE) { Perform statistical test: ```{r run calcWilcox, warning=FALSE} -wilcox <- list( +wilcoxRes <- list( wilcox_counts = calcWilcox(data, 'counts'), wilcox_relab = calcWilcox(data, 'TSS'), - wilcox_clr = calcWilcox(data, 'CLR', log = TRUE), - wilcox_relab_clr = calcWilcox(data, 'TSS + CLR', log = TRUE) + wilcox_clr = calcWilcox(data, 'CLR', log = TRUE) ) |> bind_rows(.id = 'method') ``` @@ -601,7 +487,7 @@ wilcox <- list( Filter DA taxa ```{r filter taxa} -wilcox_DA <- wilcox |> +wilcox_DA <- wilcoxRes |> dplyr::filter(adjP <= 0.1, abs(logFC) > 0) |> mutate(DA = ifelse(logFC > 0, "OA", "UA")) ``` @@ -639,7 +525,7 @@ incorrect_taxa_wilcox_clr Let's plot their values for each matrix ```{r get abundance values} -transformations <- c('counts', 'TSS', 'CLR', 'TSS + CLR') +transformations <- c('counts', 'TSS', 'CLR') l1 <- vector('list', length(transformations)) names(l1) <- transformations for (i in seq_along(transformations)) { @@ -655,7 +541,10 @@ wilcox_raw <- bind_rows(l1, .id = 'transformation') |> {\(y) pivot_longer( y, cols = 3:ncol(y), values_to = 'value', names_to = 'sample' )}() |> - left_join(data[,c('sample', 'study_condition')], by = 'sample') + left_join( + data[,c('sample', 'study_condition')], by = 'sample', + relationship = "many-to-many" + ) head(wilcox_raw) ``` @@ -666,29 +555,15 @@ Box plot of incorrect values: l <- wilcox_raw |> mutate(taxon_name = sub('genus:', '', taxon_name)) |> {\(y) split(y, y$transformation)}() -l$`TSS + CLR` <- NULL l$counts$value <- log(l$counts$value + 1) -l$TSS$value <- log(l$TSS$value + 1) +# l$TSS$value <- log(l$TSS$value + 1) # this (pseudocount) should not be necessary since a pseudocount was added above +l$TSS$value <- log(l$TSS$value) # this should not be necessary since a pseudocount was added above wilcox_raw <- reduce(l, bind_rows) - - - wilcox_genus_plot <- wilcox_raw |> - # mutate(taxon_name = sub('genus:', '', taxon_name)) |> - # mutate( - # value = case_when( - # transformation %in% c("counts", "TSS") ~ log(value + 1), - # TRUE ~ value - # ) - # ) |> - # mutate(value = log(value + 1)) |> - # filter(transformation != "TSS + CLR") |> mutate(transformation = factor( transformation, levels = c('counts', 'TSS', 'CLR'), labels = c('log(counts + 1)', 'log(TSS + 1)', 'CLR') - # transformation, levels = c('counts', 'TSS', 'CLR', 'TSS + CLR' ), - # labels = c('log(counts + 1)', 'log(TSS + 1)', 'CLR', 'TSS + CLR') )) |> mutate(study_condition = factor( study_condition, levels = c('healthy', 'bacterial_vaginosis'), @@ -713,7 +588,7 @@ wilcox_genus_plot <- wilcox_raw |> wilcox_genus_plot ``` -```{r} +```{r, eval=FALSE, echo=FALSE} ggsave( file = "Figure3.pdf", plot = wilcox_genus_plot, dpi = 300, width = 9, height = 4 @@ -721,118 +596,21 @@ ggsave( ``` - -```{r, message=FALSE} -stats <- data |> - mutate(taxon_name = sub('genus:', '', taxon_name)) |> - filter(taxon_name %in% c('Actinomyces', 'Corynebacterium')) |> - group_by(study_condition, taxon_name) |> - summarise( - mean_counts = mean(counts), - sd_counts = sd(counts), - median_counts = median(counts), - - mean_TSS = mean(TSS), - sd_TSS = sd(TSS), - median_TSS = median(TSS), - - mean_CLR = mean(CLR), - sd_CLR = sd(CLR), - median_CLR = median(CLR), - - mean_TSS_CLR = mean(`TSS + CLR`), - sd_TSS_CLR = sd(`TSS + CLR`), - median_TSS_CLR = median(`TSS + CLR`) - ) |> - ungroup() |> - arrange(taxon_name) |> - modify_if(.p = is.numeric, .f = ~ round(.x, 2)) |> - select(-starts_with("median")) -stats -``` - - -```{r} -types_names <- c("counts$", "TSS$", "[^(TSS)]_CLR$", "TSS_CLR$") -new_stats <- select(stats, taxon_name, study_condition) -for (i in seq_along(types_names)) { - pos <- grep(types_names[i], colnames(stats), value = TRUE) - mean_vals <- stats[,grep("mean", pos, value = TRUE), drop = TRUE] - sd_vals <- stats[,grep("sd", pos, value = TRUE), drop = TRUE] - new_col_name <- sub("mean_", "", pos[1]) - new_col <- paste0(mean_vals, "\u00B1", sd_vals) - new_stats[[new_col_name]] <- new_col -} -new_stats <- new_stats |> - rename( - Taxon = taxon_name, Condition = study_condition, - Counts = counts, `TSS+CLR` = TSS_CLR - ) |> - mutate( - Condition = case_when( - Condition == "healthy" ~ "HV", - Condition == "bacterial_vaginosis" ~ "BV" - ) - ) - -new_stats <- new_stats |> - pivot_longer( - names_to = "Data type", values_to = "Value", cols = Counts:last_col() - ) |> - pivot_wider( - names_from = "Condition", values_from = "Value" - ) - # filter( - # `Data type` != "TSS+CLR" - # ) -DT::datatable( - data = new_stats, - rownames = FALSE, - extensions = "Buttons", - options = list( - dom = 'Bfrtip', - buttons = c('copy', 'csv', 'excel', 'pdf', 'print') - ) -) -``` - - - -```{r plot fold change} -wilcox |> - mutate( - sig = ifelse(adjP <= 0.1, '*', '') - ) |> - mutate(sig2 = paste0(round(logFC, 2), ' ', sig)) |> - mutate(taxon_name = sub('genus:', '', taxon_name)) |> - mutate(taxon_name = as.factor(taxon_name)) |> - filter(taxon_name %in% c('Actinomyces', 'Corynebacterium')) |> - ggplot(aes(taxon_name, logFC)) + - geom_col(aes(fill = method), position = position_dodge(width = 0.9)) + - geom_text( - aes(label = sig2, group = method), - position = position_dodge(width = 0.9), vjust = -0.5 - ) + - labs( - title = 'LogFC of taxa identified as significant (adjP <= 0.1) by CLR', - subtitle = 'logFC is indicated on top of bars. * means significant' - ) -``` - ## Lefse Define a function for running Lefse: ```{r lefse function} -calcLefse <- function(dat, assay) { - res <- lefser2( - dat, kruskal.threshold = 0.05, wilcox.threshold = 0.05, - lda.threshold = 0, groupCol = 'study_condition', assay = assay +calcLefse <- function(se, assay) { + res <- lefser::lefser( + se, kruskal.threshold = 0.05, wilcox.threshold = 0.05, + lda.threshold = 0, classCol = 'study_condition', assay = assay ) - adj_pvalues <- p.adjust(res$kw_pvalues) + return(res) + # adj_pvalues <- p.adjust(res$kw_pvalues) - dplyr::mutate(res, rawP = kw_pvalues, adjP = adj_pvalues) + # dplyr::mutate(res, rawP = kw_pvalues, adjP = adj_pvalues) # res <- lefser2( # dat, kruskal.threshold = 0.05, wilcox.threshold = 0.05, @@ -858,23 +636,23 @@ taxa_annotations <- lefse <- list( lefse_counts = calcLefse(tse_genus, 'counts'), lefse_relab = calcLefse(tse_genus, 'TSS'), - lefse_clr = calcLefse(tse_genus, 'CLR'), - lefse_relab_clr = calcLefse(tse_genus, 'TSS + CLR') + lefse_clr = calcLefse(tse_genus, 'CLR') ) |> bind_rows(.id = 'method') |> mutate( DA = ifelse(scores > 0, 'OA', 'UA') ) |> - rename(taxon_name = 'Names') |> + rename(taxon_name = 'features') |> left_join(taxa_annotations, by = 'taxon_name') head(lefse) ``` ```{r} -lefse_DA <- lefse |> - dplyr::filter(adjP <= 0.1, abs(scores) > 0) |> - mutate(DA = ifelse(scores > 0, "OA", "UA")) +lefse_DA <- lefse +# lefse_DA <- lefse |> +# dplyr::filter(adjP <= 0.1, abs(scores) > 0) |> +# mutate(DA = ifelse(scores > 0, "OA", "UA")) ``` Plot lefse results: @@ -907,178 +685,7 @@ incorrect_taxa_lefse_clr <- lefse_DA |> incorrect_taxa_lefse_clr ## the same as in wilcox. ``` -## ZINQ - -```{r} -calcZINQ <- function(dat, val_col, y_Cord = 'D', log = FALSE) { - taxa <- split(dat, dat$taxon_name) - taxa_names <- names(taxa) - - taxa_annotations <- - dplyr::distinct(dplyr::select(dat, dplyr::starts_with('taxon'))) - - pvalues <- vector('double', length(taxa)) - names(pvalues) <- taxa_names - form <- paste0(val_col, ' ~ study_condition') - for (i in seq_along(pvalues)) { - df <- taxa[[i]] - - res <- tryCatch( - error = function(e) NULL, { - ZINQ::ZINQ_tests( - formula.logistic = as.formula(form), - formula.quantile = as.formula(form), - C = 'study_condition', y_CorD = y_Cord, data = df - ) - } - ) - - if (is.null(res)) { - pvalues[i] <- NA - } else { - pvalues[i] <- ZINQ::ZINQ_combination(res, method = 'Cauchy') - } - - } - - adj_pvalues <- p.adjust(pvalues, method = 'fdr') - - log_fold_change <- vector('double', length(taxa)) - for (i in seq_along(log_fold_change)) { - df <- taxa[[i]] - healthy <- df |> - dplyr::filter(study_condition == 'healthy') |> - {\(y) y[[val_col]]}() - bv <- df |> - dplyr::filter(study_condition == 'bacterial_vaginosis') |> - {\(y) y[[val_col]]}() - - if (log) { # If log, revert with exp - healthy <- mean(exp(healthy)) - bv <- mean(exp(bv)) - } else{ - healthy <- mean(healthy) - bv <- mean(bv) - } - - if (bv >= healthy) { # control is healthy, condition of interest is bv - log_fold_change[i] <- log2(bv / healthy) - } else if (bv < healthy) { - log_fold_change[i] <- -log2(healthy / bv) - } - } - - ## Combine results and annotations - output <- data.frame( - taxon_name = taxa_names, - rawP = pvalues, - adjP = adj_pvalues, - logFC = log_fold_change - ) - - return(output) - # dplyr::left_join(output, taxa_annotations, by = 'taxon_name') - -} - -``` - -Run ZINQ - -```{r, warning=FALSE} -zinq <- list( - zinq_counts = calcZINQ(data, 'counts', y_Cord = 'D'), - zinq_relab = calcZINQ(data, 'TSS', y_Cord = 'C'), - zinq_clr = calcZINQ(data, 'CLR', y_Cord = 'C'), - zinq_relab_clr = calcZINQ(data, 'TSS + CLR', y_Cord = 'C') -) |> - bind_rows(.id = 'method') |> - mutate( - DA = ifelse(logFC > 0, 'OA', 'UA') - ) |> - left_join(taxa_annotations, by = 'taxon_name') - -zinq_DA <- zinq |> - dplyr::filter(adjP <= 0.1, abs(logFC) > 0) |> - mutate(DA = ifelse(logFC > 0, "OA", "UA")) -``` - - -Plot ZINQ results - -```{r} -zinq_plot <- zinq_DA |> - dplyr::filter(taxon_annotation != 'Unannotated') |> - count(method, taxon_annotation, DA) |> - mutate(n = ifelse(DA == 'UA', -n, n)) |> - mutate(method = sub('lefse_', '', method)) |> - ggplot(aes(method, n)) + - geom_col(aes(fill = taxon_annotation), position = 'dodge') + - geom_hline(yintercept = 0) + - labs( - title = 'ZINQ test', - y = 'Number of DA taxa', x = 'Transformation method' - ) - # scale_y_continuous(limits = c(-3, 13), breaks = seq(-3, 13, 2)) -zinq_plot -``` - -```{r} -incorrect_taxa_lefse_clr <- zinq_DA |> - dplyr::filter( - method %in% c('zinq_clr', 'zinq_relab_clr'), DA == 'UA', - taxon_annotation == 'bv-associated' - ) |> - pull(taxon_name) |> - unique() -incorrect_taxa_lefse_clr ## the same as in wilcox. -``` - -# ANCOM-BC, MetagenomeSeq, and DESEQ2 - -ANCOM-BC -```{r ancombc} -# ancombc <- as.data.frame(DA_output$ancombc.none$statInfo) -# ancombc$taxon_name <- rownames(ancombc) -# ancombc <- left_join(ancombc, taxa_annotations, by = "taxon_name") |> -# relocate(taxon_name, taxon_annotation) -# ancombc |> -# filter(q_val <= 0.1, lfc < 0, taxon_annotation == 'bv-associated') |> -# pull(taxon_name) - -``` - -MetagenomeSeq - -```{r, eval=FALSE, echo=FALSE} -# metagenomeseq <- -# as.data.frame(DA_output$metagenomeSeq.CSS.fitFeatureModel$statInfo) -# metagenomeseq$taxon_name <- rownames(metagenomeseq) -# metagenomeseq <- left_join(metagenomeseq, taxa_annotations, by = "taxon_name") |> -# relocate(taxon_name, taxon_annotation) -# metagenomeseq |> -# filter( -# adjPvalues <= 0.1, logFC < 0, -# taxon_annotation == 'bv-associated' -# ) |> -# pull(taxon_name) -``` - -DESEQ2 - -```{r} -deseq <- as.data.frame(DA_output$DESeq2.poscounts$statInfo) -deseq$taxon_name <- rownames(deseq) -deseq <- left_join(deseq, taxa_annotations, by = "taxon_name") |> - relocate(taxon_name, taxon_annotation) -deseq |> - filter( - padj <= 0.1, log2FoldChange < 0, - taxon_annotation == 'bv-associated' - ) |> - pull(taxon_name) -``` # Plots of BV-associated genera @@ -1226,112 +833,6 @@ sample_sizes <- filter(data, taxon_name == 'genus:Lactobacillus') |> data_with_lact <- left_join(data, sample_sizes, by = 'sample') ``` - -Relative abundance vs CLR all taxa - -```{r} -data_with_lact |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - title = 'Relative abundace vs CLR per genus', - x = 'log(TSS + 1)' - ) + - theme_bw() -``` - -Relative abundance vs CLR BV-associated - -```{r} -data_with_lact |> - filter(taxon_annotation == 'bv-associated') |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - title = 'Relative abundace vs CLR', - subtitle = 'BV-associated genera only', - x = 'log(TSS + 1)' - ) + - scale_size(name = 'Lactobacillus Rel. Ab.') + - theme_bw() -``` - -```{r, eval=FALSE, include=FALSE, echo=FALSE} -# Plotting CLR vs Relab of Lactobacillus, Prevotella, Actinomyces, and -# Corynebacterium - -plot_1 <- data_with_lact |> - filter(taxon_name == 'genus:Actinomyces') |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - # title = 'Relative abundace vs CLR', - title = 'Actinomyces (BV-associated)', - x = 'log(TSS + 1)' - ) + - scale_size(name = 'Lactobacillus Rel. Ab.') + - theme_bw() - -plot_2 <- data_with_lact |> - filter(taxon_name == 'genus:Corynebacterium') |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - # title = 'Relative abundace vs CLR', - title = 'Corynebacterium (BV-associated)', - x = 'Relative abundance' - ) + - scale_size(name = 'Lactobacillus Rel. Ab.') + - theme_bw() - -plot_3 <- data_with_lact |> - filter(taxon_name == 'genus:Prevotella') |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - # title = 'Relative abundace vs CLR', - title = 'Prevotella (BV-associated)', - x = 'Relative abundance' - ) + - scale_size(name = 'Lactobacillus Rel. Ab.') + - theme_bw() - -plot_4 <- data_with_lact |> - filter(taxon_name == 'genus:Lactobacillus') |> - ggplot(aes(log(TSS + 1), CLR)) + - geom_point( - aes(color = study_condition, size = lact_tss), - alpha = 0.3, position = 'jitter' - ) + - labs( - # title = 'Relative abundace vs CLR', - title = 'Lactobacillus (HV)', - x = 'Relative abundance' - ) + - scale_size(name = 'Lactobacillus Rel. Ab.') + - theme_bw() - -plots <- ggpubr::ggarrange( - plot_4, plot_3, plot_1, plot_2, align = 'hv', - common.legend = TRUE, legend = 'bottom' -) -``` - Plotting log(CLR) vs log(Relab) of Lactobacillus, Prevotella, Actinomyces, and Corynebacterium. @@ -1344,7 +845,7 @@ plot_1b <- data_with_lact |> labels = c('BV', 'HV') ) ) |> - ggplot(aes(log(TSS + 1), CLR)) + + ggplot(aes(log(TSS), CLR)) + geom_point( aes(color = study_condition, size = lact_tss), alpha = 0.3 @@ -1367,7 +868,7 @@ plot_2b <- data_with_lact |> labels = c('BV', 'HV') ) ) |> - ggplot(aes(log(TSS + 1), CLR)) + + ggplot(aes(log(TSS), CLR)) + geom_point( aes(color = study_condition, size = lact_tss), alpha = 0.3 @@ -1390,7 +891,7 @@ plot_3b <- data_with_lact |> labels = c('BV', 'HV') ) ) |> - ggplot(aes(log(TSS + 1), CLR)) + + ggplot(aes(log(TSS), CLR)) + geom_point( aes(color = study_condition, size = lact_tss), alpha = 0.3 @@ -1413,7 +914,7 @@ plot_4b <- data_with_lact |> labels = c('BV', 'HV') ) ) |> - ggplot(aes(log(TSS + 1), CLR)) + + ggplot(aes(log(TSS), CLR)) + geom_point( aes(color = study_condition, size = lact_tss), alpha = 0.3