+
-
+
-
+
-
+
-
+
-
+
diff --git a/docs/listings.json b/docs/listings.json
index 48dc1f3..5d40cea 100644
--- a/docs/listings.json
+++ b/docs/listings.json
@@ -2,6 +2,7 @@
{
"listing": "/index.html",
"items": [
+ "/notebooks/2024-04-08_brumfield.html",
"/notebooks/2024-04-01_spurbeck.html",
"/notebooks/2024-03-19_yang-2.html",
"/notebooks/2024-03-16_yang.html",
diff --git a/docs/notebooks/2024-04-08_brumfield.html b/docs/notebooks/2024-04-08_brumfield.html
index b882a74..f86b507 100644
--- a/docs/notebooks/2024-04-08_brumfield.html
+++ b/docs/notebooks/2024-04-08_brumfield.html
@@ -1627,935 +1627,934 @@
df-print: paged
editor: visual
title-block-banner: black
-draft: true
----
-
-```{r}
-#| label: load-packages
-#| include: false
-library(tidyverse)
-library(cowplot)
-library(patchwork)
-library(fastqcr)
-library(RColorBrewer)
-source("../scripts/aux_plot-theme.R")
-theme_base <- theme_base + theme(aspect.ratio = NULL)
-theme_kit <- theme_base + theme(
- axis.text.x = element_text(hjust = 1, angle = 45),
- axis.title.x = element_blank(),
-)
-tnl <- theme(legend.position = "none")
-```
-
-Continuing my analysis of datasets from the [P2RA preprint](https://doi.org/10.1101/2023.12.22.23300450), I analyzed the data from [Brumfield et al. (2022)](https://doi.org/10.1128/mbio.00591-22). This study conducted longitudinal monitoring of SARS-CoV-2 via qPCR from a single manhole in Maryland, combined with comprehensive microbiome analysis using metagenomics during a major COVID spike in early 2021. In total, they sequenced six samples from February to April 2021.
-
-One unusual feature that makes this study interesting is that they conducted both RNA and DNA sequencing on each study. Prior to sequencing, samples underwent concentration with the InnovaPrep Concentrating Pipette, followed by separate DNA & RNA extraction using different kits. RNA samples underwent rRNA depletion prior to library prep. All samples were sequenced on an Illumina HiSeq4000 with 2x150bp reads.
-
-# The raw data
-
-The 6 samples from the Brumfield dataset yielded 24M-45M (mean 33M) DNA-sequencing reads and 29M-64M (mean 46M) RNA-sequencing reads per sample, for a total of 196M DNA read pairs and 276M RNA read pairs (59 and 83 gigabases of sequence, respectively). Read qualities were mostly high but tailed off at the 3' end in some samples, suggesting the need for trimming. Adapter levels were very high. Inferred duplication levels were 44-58% in DNA reads and 90-96% in RNA reads, i.e. very high.
-
-```{r}
-#| warning: false
-#| label: read-qc-data
-
-# Data input paths
-data_dir <- "../data/2024-03-19_brumfield/"
-libraries_path <- file.path(data_dir, "sample-metadata.csv")
-basic_stats_path <- file.path(data_dir, "qc_basic_stats.tsv")
-adapter_stats_path <- file.path(data_dir, "qc_adapter_stats.tsv")
-quality_base_stats_path <- file.path(data_dir, "qc_quality_base_stats.tsv")
-quality_seq_stats_path <- file.path(data_dir, "qc_quality_sequence_stats.tsv")
-
-# Import libraries and extract metadata from sample names
-libraries <- read_csv(libraries_path, show_col_types = FALSE) %>%
- arrange(date) %>% mutate(date = fct_inorder(as.character(date))) %>%
- arrange(desc(na_type)) %>% mutate(na_type = fct_inorder(na_type))
-
-# Import QC data
-stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary")
-basic_stats <- read_tsv(basic_stats_path, show_col_types = FALSE) %>%
- inner_join(libraries, by="sample") %>%
- mutate(stage = factor(stage, levels = stages),
- sample = fct_inorder(sample))
-adapter_stats <- read_tsv(adapter_stats_path, show_col_types = FALSE) %>%
- mutate(sample = sub("_2$", "", sample)) %>%
- inner_join(libraries, by="sample") %>%
- mutate(stage = factor(stage, levels = stages),
- read_pair = fct_inorder(as.character(read_pair)))
-quality_base_stats <- read_tsv(quality_base_stats_path, show_col_types = FALSE) %>%
- inner_join(libraries, by="sample") %>%
- mutate(stage = factor(stage, levels = stages),
- read_pair = fct_inorder(as.character(read_pair)))
-quality_seq_stats <- read_tsv(quality_seq_stats_path, show_col_types = FALSE) %>%
- inner_join(libraries, by="sample") %>%
- mutate(stage = factor(stage, levels = stages),
- read_pair = fct_inorder(as.character(read_pair)))
-
-# Filter to raw data
-basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat")
-adapter_stats_raw <- adapter_stats %>% filter(stage == "raw_concat")
-quality_base_stats_raw <- quality_base_stats %>% filter(stage == "raw_concat")
-quality_seq_stats_raw <- quality_seq_stats %>% filter(stage == "raw_concat")
-```
-
-```{r}
-#| fig-height: 5
-#| warning: false
-#| label: plot-basic-stats
-
-# Visualize basic stats
-scale_fill_na <- purrr::partial(scale_fill_brewer, palette="Set1", name="Nucleic acid type")
-g_basic <- ggplot(basic_stats_raw, aes(x=date, fill=na_type, group=sample)) +
- scale_fill_na() + theme_kit
-g_nreads_raw <- g_basic + geom_col(aes(y=n_read_pairs), position = "dodge") +
- scale_y_continuous(name="# Read pairs", expand=c(0,0))
-g_nreads_raw_2 <- g_nreads_raw + theme(legend.position = "none")
-legend_group <- get_legend(g_nreads_raw)
-
-g_nbases_raw <- g_basic + geom_col(aes(y=n_bases_approx), position = "dodge") +
- scale_y_continuous(name="Total base pairs (approx)", expand=c(0,0)) +
- theme(legend.position = "none")
-g_ndup_raw <- g_basic + geom_col(aes(y=percent_duplicates), position = "dodge") +
- scale_y_continuous(name="% Duplicates (FASTQC)", expand=c(0,0), limits = c(0,100), breaks = seq(0,100,20)) +
- theme(legend.position = "none")
-
-g_basic_raw <- plot_grid(g_nreads_raw_2 + g_nbases_raw + g_ndup_raw, legend_group, ncol = 1, rel_heights = c(1,0.1))
-g_basic_raw
-```
-
-```{r}
-#| label: plot-raw-quality
-
-scale_color_na <- purrr::partial(scale_color_brewer,palette="Set1",name="Nucleic acid type")
-g_qual_raw <- ggplot(mapping=aes(color=na_type, linetype=read_pair,
- group=interaction(sample,read_pair))) +
- scale_color_na() + scale_linetype_discrete(name = "Read Pair") +
- guides(color=guide_legend(nrow=2,byrow=TRUE),
- linetype = guide_legend(nrow=2,byrow=TRUE)) +
- theme_base
-
-# Visualize adapters
-g_adapters_raw <- g_qual_raw +
- geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +
- scale_y_continuous(name="% Adapters", limits=c(0,NA),
- breaks = seq(0,100,10), expand=c(0,0)) +
- scale_x_continuous(name="Position", limits=c(0,NA),
- breaks=seq(0,140,20), expand=c(0,0)) +
- facet_wrap(~adapter)
-g_adapters_raw
-
-# Visualize quality
-g_quality_base_raw <- g_qual_raw +
- geom_hline(yintercept=25, linetype="dashed", color="red") +
- geom_hline(yintercept=30, linetype="dashed", color="red") +
- geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +
- scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
- scale_x_continuous(name="Position", limits=c(0,NA),
- breaks=seq(0,140,20), expand=c(0,0))
-g_quality_base_raw
-
-g_quality_seq_raw <- g_qual_raw +
- geom_vline(xintercept=25, linetype="dashed", color="red") +
- geom_vline(xintercept=30, linetype="dashed", color="red") +
- geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +
- scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
- scale_y_continuous(name="# Sequences", expand=c(0,0))
-g_quality_seq_raw
-```
-
-# Preprocessing
-
-The average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. On average, cleaning & deduplication together removed about 50% of total reads from DNA libraries and about 92% from RNA libraries, primarily during deduplication. Only a few reads (low single-digit percentages at most) were lost during subsequent ribodepletion, even in RNA reads, consistent with the samples having undergone rRNA depletion prior to sequencing.
-
-```{r}
-#| label: preproc-table
-n_reads_rel <- basic_stats %>%
- select(sample, na_type, stage, percent_duplicates, n_read_pairs) %>%
- group_by(sample, na_type) %>% arrange(sample, na_type, stage) %>%
- mutate(p_reads_retained = n_read_pairs / lag(n_read_pairs),
- p_reads_lost = 1 - p_reads_retained,
- p_reads_retained_abs = n_read_pairs / n_read_pairs[1],
- p_reads_lost_abs = 1-p_reads_retained_abs,
- p_reads_lost_abs_marginal = p_reads_lost_abs - lag(p_reads_lost_abs))
-n_reads_rel_display <- n_reads_rel %>% rename(Stage=stage, `NA Type`=na_type) %>%
- group_by(`NA Type`, Stage) %>%
- summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"),
- `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>%
- filter(Stage != "raw_concat") %>%
- mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion")))
-n_reads_rel_display
-```
-
-```{r}
-#| label: preproc-figures
-#| warning: false
-# Plot reads over preprocessing
-g_reads_stages <- ggplot(basic_stats, aes(x=stage, y=n_read_pairs,fill=na_type,group=sample)) +
- geom_col(position="dodge") + scale_fill_na() +
- scale_y_continuous("# Read pairs", expand=c(0,0)) +
- theme_kit
-g_reads_stages
-
-# Plot relative read losses during preprocessing
-g_reads_rel <- ggplot(n_reads_rel, aes(x=stage, y=p_reads_lost_abs_marginal,fill=na_type,group=sample)) +
- geom_col(position="dodge") + scale_fill_na() +
- scale_y_continuous("% Total Reads Lost", expand=c(0,0), labels = function(x) x*100) +
- theme_kit
-g_reads_rel
-```
-
-Data cleaning with FASTP was very successful at removing adapters, with very few adapter sequences found by FASTQC at any stage after the raw data. FASTP was also successful at improving read quality.
-
-```{r}
-#| warning: false
-#| label: plot-quality
-
-g_qual <- ggplot(mapping=aes(color=na_type, linetype=read_pair,
- group=interaction(sample,read_pair))) +
- scale_color_na() + scale_linetype_discrete(name = "Read Pair") +
- guides(color=guide_legend(nrow=2,byrow=TRUE),
- linetype = guide_legend(nrow=2,byrow=TRUE)) +
- theme_base
-
-# Visualize adapters
-g_adapters <- g_qual +
- geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +
- scale_y_continuous(name="% Adapters", limits=c(0,20),
- breaks = seq(0,50,10), expand=c(0,0)) +
- scale_x_continuous(name="Position", limits=c(0,NA),
- breaks=seq(0,140,20), expand=c(0,0)) +
- facet_grid(stage~adapter)
-g_adapters
-
-# Visualize quality
-g_quality_base <- g_qual +
- geom_hline(yintercept=25, linetype="dashed", color="red") +
- geom_hline(yintercept=30, linetype="dashed", color="red") +
- geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +
- scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
- scale_x_continuous(name="Position", limits=c(0,NA),
- breaks=seq(0,140,20), expand=c(0,0)) +
- facet_grid(stage~.)
-g_quality_base
-
-g_quality_seq <- g_qual +
- geom_vline(xintercept=25, linetype="dashed", color="red") +
- geom_vline(xintercept=30, linetype="dashed", color="red") +
- geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +
- scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
- scale_y_continuous(name="# Sequences", expand=c(0,0)) +
- facet_grid(stage~.)
-g_quality_seq
-```
-
-According to FASTQC, deduplication was moderately effective at reducing measured duplicate levels, with FASTQC-measured levels falling from an average of 50% to one of 16% for DNA reads, and from 93% to 42% for RNA reads:
-
-```{r}
-#| label: preproc-dedup
-
-g_dup_stages <- ggplot(basic_stats, aes(x=stage, y=percent_duplicates, fill=na_type, group=sample)) +
- geom_col(position="dodge") + scale_fill_na() +
- scale_y_continuous("% Duplicates", limits=c(0,100), breaks=seq(0,100,20), expand=c(0,0)) +
- theme_kit
-g_readlen_stages <- ggplot(basic_stats, aes(x=stage, y=mean_seq_len, fill=na_type, group=sample)) +
- geom_col(position="dodge") + scale_fill_na() +
- scale_y_continuous("Mean read length (nt)", expand=c(0,0)) +
- theme_kit
-legend_loc <- get_legend(g_dup_stages)
-g_dup_stages
-g_readlen_stages
-```
-
-# High-level composition
-
-As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:
-
-```{r}
-#| label: plot-composition-all
-
-# Import Bracken data
-bracken_path <- file.path(data_dir, "bracken_counts.tsv")
-bracken <- read_tsv(bracken_path, show_col_types = FALSE)
-total_assigned <- bracken %>% group_by(sample) %>% summarize(
- name = "Total",
- kraken_assigned_reads = sum(kraken_assigned_reads),
- added_reads = sum(added_reads),
- new_est_reads = sum(new_est_reads),
- fraction_total_reads = sum(fraction_total_reads)
-)
-bracken_spread <- bracken %>% select(name, sample, new_est_reads) %>%
- mutate(name = tolower(name)) %>%
- pivot_wider(id_cols = "sample", names_from = "name", values_from = "new_est_reads")
-# Count reads
-read_counts_preproc <- basic_stats %>%
- select(sample, na_type, date, stage, n_read_pairs) %>%
- pivot_wider(id_cols = c("sample", "na_type", "date"), names_from="stage", values_from="n_read_pairs")
-read_counts <- read_counts_preproc %>%
- inner_join(total_assigned %>% select(sample, new_est_reads), by = "sample") %>%
- rename(assigned = new_est_reads) %>%
- inner_join(bracken_spread, by="sample")
-# Assess composition
-read_comp <- transmute(read_counts, sample=sample, na_type=na_type, date=date,
- n_filtered = raw_concat-cleaned,
- n_duplicate = cleaned-dedup,
- n_ribosomal = (dedup-ribo_initial) + (ribo_initial-ribo_secondary),
- n_unassigned = ribo_secondary-assigned,
- n_bacterial = bacteria,
- n_archaeal = archaea,
- n_viral = viruses,
- n_human = eukaryota)
-read_comp_long <- pivot_longer(read_comp, -(sample:date), names_to = "classification",
- names_prefix = "n_", values_to = "n_reads") %>%
- mutate(classification = fct_inorder(str_to_sentence(classification))) %>%
- group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads))
-read_comp_summ <- read_comp_long %>%
- group_by(na_type, classification) %>%
- summarize(n_reads = sum(n_reads), .groups = "drop_last") %>%
- mutate(n_reads = replace_na(n_reads,0),
- p_reads = n_reads/sum(n_reads),
- pc_reads = p_reads*100)
-
-# Plot overall composition
-g_comp <- ggplot(read_comp_long, aes(x=sample, y=p_reads, fill=classification)) +
- geom_col(position = "stack") +
- scale_y_continuous(name = "% Reads", limits = c(0,1.01), breaks = seq(0,1,0.2),
- expand = c(0,0), labels = function(x) x*100) +
- scale_fill_brewer(palette = "Set1", name = "Classification") +
- facet_wrap(.~na_type, scales="free", ncol=5) +
- theme_kit
-g_comp
-
-# Plot composition of minor components
-read_comp_minor <- read_comp_long %>% filter(classification %in% c("Archaeal", "Viral", "Human", "Other"))
-palette_minor <- brewer.pal(9, "Set1")[6:9]
-g_comp_minor <- ggplot(read_comp_minor, aes(x=sample, y=p_reads, fill=classification)) +
- geom_col(position = "stack") +
- scale_y_continuous(name = "% Reads",
- expand = c(0,0), labels = function(x) x*100) +
- scale_fill_manual(values=palette_minor, name = "Classification") +
- facet_wrap(.~na_type, scales = "free_x", ncol=5) +
- theme_kit
-g_comp_minor
-
-```
-
-```{r}
-#| label: composition-summary
-
-p_reads_summ_group <- read_comp_long %>%
- mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), "Excluded", as.character(classification)),
- classification = fct_inorder(classification)) %>%
- group_by(classification, sample, na_type) %>%
- summarize(p_reads = sum(p_reads), .groups = "drop") %>%
- group_by(classification, na_type) %>%
- summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100,
- pc_mean = mean(p_reads)*100, .groups = "drop")
-p_reads_summ_prep <- p_reads_summ_group %>%
- mutate(classification = fct_inorder(classification),
- pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
- pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
- pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
- display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)"))
-p_reads_summ <- p_reads_summ_prep %>%
- select(classification, na_type, display) %>%
- pivot_wider(names_from=na_type, values_from = display)
-p_reads_summ
-```
-
-As previously noted, RNA samples were overwhelmingly composed of duplicates. Despite this, the RNA samples achieved a decently high level of total viral reads, with an average of 0.55% of reads mapping to viruses (higher than Crits-Christoph). Levels of total viral reads were much lower in DNA libraries, which were dominated by unassigned and (non-ribosomal) bacterial sequences.
-
-Within viral reads, RNA libraries were (as usual) dominated by plant viruses, particularly *Virgaviridae* – though one sample, unusually, has a substantial minority of reads from *Fiersviridae*, a family of RNA phages. Detected DNA viruses come from a wider range of families, but the most abundant families (*Suoliviridae*, *Intestiviridae*, *Autographiviridae*, *Myoviridae*) are all DNA phages.
-
-```{r}
-#| label: plot-viral-families-all
-
-# Get viral taxonomy
-viral_taxa_path <- file.path(data_dir, "viral-taxids.tsv.gz")
-viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)
-
-# Import Kraken reports & extract & count viral families
-samples <- as.character(basic_stats_raw$sample)
-col_names <- c("pc_reads_total", "n_reads_clade", "n_reads_direct", "rank", "taxid", "name")
-report_paths <- paste0(data_dir, "kraken/", samples, ".report.gz")
-kraken_reports <- lapply(1:length(samples),
- function(n) read_tsv(report_paths[n], col_names = col_names,
- show_col_types = FALSE) %>%
- mutate(sample = samples[n])) %>% bind_rows
-kraken_reports_viral <- filter(kraken_reports, taxid %in% viral_taxa$taxid) %>%
- group_by(sample) %>%
- mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])
-viral_families <- kraken_reports_viral %>% filter(rank == "F") %>%
- inner_join(libraries, by="sample")
-
-# Identify major viral families
-viral_families_major_tab <- viral_families %>% group_by(name, taxid, na_type) %>%
- summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>%
- filter(p_reads_viral_max >= 0.04)
-viral_families_major_list <- viral_families_major_tab %>% pull(name)
-viral_families_major <- viral_families %>% filter(name %in% viral_families_major_list) %>%
- select(name, taxid, sample, na_type, date, p_reads_viral)
-viral_families_minor <- viral_families_major %>% group_by(sample, date, na_type) %>%
- summarize(p_reads_viral_major = sum(p_reads_viral), .groups = "drop") %>%
- mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%
- select(name, taxid, sample, na_type, date, p_reads_viral)
-viral_families_display <- bind_rows(viral_families_major, viral_families_minor) %>%
- arrange(desc(p_reads_viral)) %>% mutate(name = factor(name, levels=c(viral_families_major_list, "Other")))
-g_families <- ggplot(viral_families_display, aes(x=date, y=p_reads_viral, fill=name)) +
- geom_col(position="stack") +
- scale_fill_brewer(palette = "Set3", name = "Viral family") +
- scale_y_continuous(name="% Viral Reads", limits=c(0,1), breaks = seq(0,1,0.2),
- expand=c(0,0), labels = function(y) y*100) +
- facet_wrap(~na_type) +
- theme_kit
-g_families
-```
-
-Given the very high level of duplicates in the RNA data, I also repeated the analysis from my second Yang et al. entry, re-running taxonomic composition analysis on pre-deduplication data and comparing the effects of deduplication on composition:
-
-```{r}
-#| label: plot-composition-dedup
-
-class_levels <- c("Unassigned", "Bacterial", "Archaeal", "Viral", "Human")
-subset_factor <- 0.05
-
-# 1. Remove filtered & duplicate reads from original Bracken output & renormalize
-read_comp_renorm <- read_comp_long %>%
- filter(! classification %in% c("Filtered", "Duplicate")) %>%
- group_by(sample) %>%
- mutate(p_reads = n_reads/sum(n_reads),
- classification = classification %>% as.character %>%
- ifelse(. == "Ribosomal", "Bacterial", .)) %>%
- group_by(sample, na_type, date, classification) %>%
- summarize(n_reads = sum(n_reads), p_reads = sum(p_reads), .groups = "drop") %>%
- mutate(classification = factor(classification, levels = class_levels))
-
-# 2. Import pre-deduplicated
-bracken_path_predup <- file.path(data_dir, "bracken_counts_subset.tsv")
-bracken_predup <- read_tsv(bracken_path_predup, show_col_types = FALSE)
-total_assigned_predup <- bracken_predup %>% group_by(sample) %>% summarize(
- name = "Total",
- kraken_assigned_reads = sum(kraken_assigned_reads),
- added_reads = sum(added_reads),
- new_est_reads = sum(new_est_reads),
- fraction_total_reads = sum(fraction_total_reads)
-)
-bracken_spread_predup <- bracken_predup %>% select(name, sample, new_est_reads) %>%
- mutate(name = tolower(name)) %>%
- pivot_wider(id_cols = "sample", names_from = "name", values_from = "new_est_reads")
-# Count reads
-read_counts_predup <- read_counts_preproc %>%
- select(sample, na_type, date, raw_concat, cleaned) %>%
- mutate(raw_concat = raw_concat * subset_factor, cleaned = cleaned * subset_factor) %>%
- inner_join(total_assigned_predup %>% select(sample, new_est_reads), by = "sample") %>%
- rename(assigned = new_est_reads) %>%
- inner_join(bracken_spread_predup, by="sample")
-# Assess composition
-read_comp_predup <- transmute(read_counts_predup, sample=sample, na_type = na_type,
- date=date,
- n_filtered = raw_concat-cleaned,
- n_unassigned = cleaned-assigned,
- n_bacterial = bacteria,
- n_archaeal = archaea,
- n_viral = viruses,
- n_human = eukaryota)
-read_comp_predup_long <- pivot_longer(read_comp_predup, -(sample:date),
- names_to = "classification",
- names_prefix = "n_", values_to = "n_reads") %>%
- mutate(classification = fct_inorder(str_to_sentence(classification))) %>%
- group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads)) %>%
- filter(! classification %in% c("Filtered", "Duplicate")) %>%
- group_by(sample) %>%
- mutate(p_reads = n_reads/sum(n_reads))
-
-# 3. Combine
-read_comp_comb <- bind_rows(read_comp_predup_long %>% mutate(deduplicated = FALSE),
- read_comp_renorm %>% mutate(deduplicated = TRUE)) %>%
- mutate(label = ifelse(deduplicated, "Post-dedup", "Pre-dedup") %>% fct_inorder)
-
-# Plot overall composition
-g_comp_predup <- ggplot(read_comp_comb, aes(x=label, y=p_reads, fill=classification)) +
- geom_col(position = "stack") +
- scale_y_continuous(name = "% Reads", limits = c(0,1.01), breaks = seq(0,1,0.2),
- expand = c(0,0), labels = function(x) x*100) +
- scale_fill_brewer(palette = "Set1", name = "Classification") +
- facet_grid(na_type~date) +
- theme_kit
-g_comp_predup
-
-```
-
-In general, deduplication has little effect on the composition of DNA samples, which remain primarily bacterial and unassigned. A few RNA samples show a reduction in the bacterial (more precisely, bacterial + rRNA) fraction after deduplication, consistent with the presence of some true biological duplicate sequences (most likely rRNA) that are being collapsed. Surprisingly, however, the largest effect is in the opposite direction, with several samples showing large increases in bacterial sequences following deduplication. This suggests that some highly abundant non-bacterial sequence is being collapsed in these samples, increasing the apparent abundance of bacteria.
-
-The increase in bacterial reads in these samples comes at the expense of (1) human reads and (2) the unassigned fraction. The former suggests the presence of human rRNA sequences that are being erroneously incorporated into the post-deduplication "bacterial" fraction; however, this effect is smaller than the decrease in unassigned reads. One possibility might be other non-human eukaryotic rRNA sequences, which Kraken is unable to assign using our current database.
-
-# Human-infecting virus reads: validation
-
-Next, I investigated the human-infecting virus read content of these unenriched samples. To get good results here, I needed to make some changes to the pipeline, including adding Trimmomatic as an additional primer-trimming step to prevent further primer-contamination-based false positives. However, for the sake of time I'm not describing them in detail here; if you want to see more I encourage you to check the commit log at the [workflow repo](https://github.com/naobservatory/mgs-workflow).
-
-Having made these changes, the workflow identified a total of 14,073 RNA read pairs and 70 DNA read pairs across all samples (0.09% and 0.00009% of surviving reads, respectively).
-
-```{r}
-#| label: hv-read-counts
-
-hv_reads_filtered_path <- file.path(data_dir, "hv_hits_putative_filtered.tsv.gz")
-hv_reads_filtered <- read_tsv(hv_reads_filtered_path, show_col_types = FALSE) %>%
- inner_join(libraries, by="sample") %>%
- arrange(date, na_type)
-n_hv_filtered <- hv_reads_filtered %>% group_by(sample, date, na_type) %>% count %>% inner_join(basic_stats %>% filter(stage == "ribo_initial") %>% select(sample, n_read_pairs), by="sample") %>% rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)
-n_hv_filtered_summ <- n_hv_filtered %>% group_by(na_type) %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)
-```
-
-```{r}
-#| label: plot-hv-scores
-#| warning: false
-#| fig-width: 8
-
-mrg <- hv_reads_filtered %>%
- mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
- ifelse(hit_hv, "Kraken2 HV\nhit",
- "No hit or\nassignment"))) %>%
- group_by(sample, na_type) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%
- mutate(seq_num = row_number(),
- adj_score_max = pmax(adj_score_fwd, adj_score_rev))
-# Import Bowtie2/Kraken data and perform filtering steps
-g_mrg <- ggplot(mrg, aes(x=adj_score_fwd, y=adj_score_rev)) +
- geom_point(alpha=0.5, shape=16) +
- scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- facet_grid(na_type~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
- theme_base + theme(aspect.ratio=1)
-g_mrg
-
-g_hist_0 <- ggplot(mrg, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position="dodge") + facet_grid(na_type~kraken_label, scales="free_y") + scale_x_continuous(name = "Maximum adjusted alignment score") + scale_y_continuous(name="# Read pairs") + scale_fill_brewer(palette = "Dark2") + theme_base + geom_vline(xintercept=20, linetype="dashed", color="red")
-g_hist_0
-```
-
-As previously described, I ran BLASTN on these reads via a dedicated EC2 instance, using the same parameters I've used for previous datasets.
-
-```{r}
-#| label: make-blast-fasta
-mrg_fasta <- mrg %>%
- mutate(seq_head = paste0(">", seq_id)) %>%
- ungroup %>%
- select(header1=seq_head, seq1=query_seq_fwd,
- header2=seq_head, seq2=query_seq_rev) %>%
- mutate(header1=paste0(header1, "_1"), header2=paste0(header2, "_2"))
-mrg_fasta_out <- do.call(paste, c(mrg_fasta, sep="\n")) %>% paste(collapse="\n")
-write(mrg_fasta_out, file.path(data_dir, "brumfield-putative-viral.fasta"))
-```
-
-```{r}
-#| label: process-blast-data
-#| warning: false
-# Import BLAST results
-blast_results_path <- file.path(data_dir, "blast/brumfield-putative-viral.blast.gz")
-blast_cols <- c("qseqid", "sseqid", "sgi", "staxid", "qlen", "evalue", "bitscore", "qcovs", "length", "pident", "mismatch", "gapopen", "sstrand", "qstart", "qend", "sstart", "send")
-blast_results <- read_tsv(blast_results_path, show_col_types = FALSE,
- col_names = blast_cols, col_types = cols(.default="c"))
-
-# Add viral status
-blast_results_viral <- mutate(blast_results, viral = staxid %in% viral_taxa$taxid)
-
-# Filter for best hit for each query/subject, then rank hits for each query
-blast_results_best <- blast_results_viral %>% group_by(qseqid, staxid) %>%
- filter(bitscore == max(bitscore)) %>%
- filter(length == max(length)) %>% filter(row_number() == 1)
-blast_results_ranked <- blast_results_best %>%
- group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))
-blast_results_highrank <- blast_results_ranked %>% filter(rank <= 5) %>%
- mutate(read_pair = str_split(qseqid, "_") %>% sapply(nth, n=-1),
- seq_id = str_split(qseqid, "_") %>% sapply(nth, n=1)) %>%
- mutate(bitscore = as.numeric(bitscore))
-
-# Summarize by read pair and taxid
-blast_results_paired <- blast_results_highrank %>%
- group_by(seq_id, staxid, viral) %>%
- summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),
- n_reads = n(), .groups = "drop") %>%
- mutate(viral_full = viral & n_reads == 2)
-
-# Compare to Kraken & Bowtie assignments
-mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max, na_type)
-blast_results_assign <- left_join(blast_results_paired, mrg_assign, by="seq_id") %>%
- mutate(taxid_match_bowtie = (staxid == taxid),
- taxid_match_kraken = (staxid == assigned_taxid),
- taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
-blast_results_out <- blast_results_assign %>%
- group_by(seq_id) %>%
- summarize(viral_status = ifelse(any(viral_full), 2,
- ifelse(any(taxid_match_any), 2,
- ifelse(any(viral), 1, 0))),
- .groups = "drop")
-```
-
-```{r}
-#| label: plot-blast-results
-#| fig-height: 6
-#| warning: false
-
-# Merge BLAST results with unenriched read data
-mrg_blast <- full_join(mrg, blast_results_out, by="seq_id") %>%
- mutate(viral_status = replace_na(viral_status, 0),
- viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))
-mrg_blast_rna <- mrg_blast %>% filter(na_type == "RNA")
-mrg_blast_dna <- mrg_blast %>% filter(na_type == "DNA")
-
-# Plot RNA
-g_mrg_blast_rna <- mrg_blast_rna %>%
- ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +
- geom_point(alpha=0.5, shape=16) +
- scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- scale_color_brewer(palette = "Set1", name = "Viral status") +
- facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
- theme_base + labs(title="RNA") +
- theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))
-g_mrg_blast_rna
-
-# Plot DNA
-g_mrg_blast_dna <- mrg_blast_dna %>%
- ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +
- geom_point(alpha=0.5, shape=16) +
- scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
- scale_color_brewer(palette = "Set1", name = "Viral status") +
- facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
- theme_base + labs(title="DNA") +
- theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))
-g_mrg_blast_dna
-```
-
-```{r}
-#| label: plot-blast-histogram
-g_hist_1 <- ggplot(mrg_blast, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position="dodge") + facet_grid(na_type~viral_status_out, scales = "free_y") + scale_x_continuous(name = "Maximum adjusted alignment score") + scale_y_continuous(name="# Read pairs") + scale_fill_brewer(palette = "Dark2") + theme_base + geom_vline(xintercept=20, linetype="dashed", color="red")
-g_hist_1
-```
-
-These results look very good on visual inspection, and indeed precision and sensitivity are both very high. For a disjunctive score threshold of 20, my updated workflow achieves an F1 score of 99.0% for RNA sequences and 99.2% for DNA sequences; more than high enough to continue on to human viral analysis.
-
-```{r}
-#| label: plot-f1
-test_sens_spec <- function(tab, score_threshold, conjunctive, include_special){
- if (!include_special) tab <- filter(tab, viral_status_out %in% c("TRUE", "FALSE"))
- tab_retained <- tab %>% mutate(
- conjunctive = conjunctive,
- retain_score_conjunctive = (adj_score_fwd > score_threshold & adj_score_rev > score_threshold),
- retain_score_disjunctive = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),
- retain_score = ifelse(conjunctive, retain_score_conjunctive, retain_score_disjunctive),
- retain = assigned_hv | hit_hv | retain_score) %>%
- group_by(viral_status_out, retain) %>% count
- pos_tru <- tab_retained %>% filter(viral_status_out == "TRUE", retain) %>% pull(n) %>% sum
- pos_fls <- tab_retained %>% filter(viral_status_out != "TRUE", retain) %>% pull(n) %>% sum
- neg_tru <- tab_retained %>% filter(viral_status_out != "TRUE", !retain) %>% pull(n) %>% sum
- neg_fls <- tab_retained %>% filter(viral_status_out == "TRUE", !retain) %>% pull(n) %>% sum
- sensitivity <- pos_tru / (pos_tru + neg_fls)
- specificity <- neg_tru / (neg_tru + pos_fls)
- precision <- pos_tru / (pos_tru + pos_fls)
- f1 <- 2 * precision * sensitivity / (precision + sensitivity)
- out <- tibble(threshold=score_threshold, include_special = include_special,
- conjunctive = conjunctive, sensitivity=sensitivity,
- specificity=specificity, precision=precision, f1=f1)
- return(out)
-}
-range_f1 <- function(intab, inc_special, inrange=15:45){
- tss <- purrr::partial(test_sens_spec, tab=intab, include_special=inc_special)
- stats_conj <- lapply(inrange, tss, conjunctive=TRUE) %>% bind_rows
- stats_disj <- lapply(inrange, tss, conjunctive=FALSE) %>% bind_rows
- stats_all <- bind_rows(stats_conj, stats_disj) %>%
- pivot_longer(!(threshold:conjunctive), names_to="metric", values_to="value") %>%
- mutate(conj_label = ifelse(conjunctive, "Conjunctive", "Disjunctive"))
- return(stats_all)
-}
-inc_special <- FALSE
-stats_rna <- range_f1(mrg_blast_rna, inc_special) %>% mutate(na_type = "RNA")
-stats_dna <- range_f1(mrg_blast_dna, inc_special) %>% mutate(na_type = "DNA")
-stats_0 <- bind_rows(stats_rna, stats_dna) %>% mutate(attempt=0)
-threshold_opt_0 <- stats_0 %>% group_by(conj_label,attempt,na_type) %>%
- filter(metric == "f1") %>%
- filter(value == max(value)) %>% filter(threshold == min(threshold))
-g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
- geom_vline(data = threshold_opt_0, mapping = aes(xintercept=threshold),
- color = "red", linetype = "dashed") +
- geom_line() +
- scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
- scale_x_continuous(name = "Threshold", expand = c(0,0)) +
- scale_color_brewer(palette="Set3") +
- facet_grid(na_type~conj_label) +
- theme_base
-g_stats_0
-```
-
-# Human-infecting virus reads: analysis
-
-```{r}
-#| label: count-hv-reads
-
-# Get raw read counts
-read_counts_raw <- basic_stats_raw %>%
- select(sample, date, na_type, n_reads_raw = n_read_pairs)
-
-# Get HV read counts
-mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | hit_hv | adj_score_max >= 20)
-read_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% count(name="n_reads_hv")
-read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>%
- mutate(n_reads_hv = replace_na(n_reads_hv, 0))
-
-# Aggregate
-read_counts_total <- read_counts %>% group_by(na_type) %>%
- summarize(n_reads_raw = sum(n_reads_raw),
- n_reads_hv = sum(n_reads_hv)) %>%
- mutate(sample= "All samples", date = "All dates")
-read_counts_agg <- read_counts %>% arrange(date) %>%
- mutate(date = as.character(date)) %>% arrange(sample) %>%
- bind_rows(read_counts_total) %>%
- mutate(sample = fct_inorder(sample),
- date = fct_inorder(date),
- p_reads_hv = n_reads_hv/n_reads_raw)
-
-# Get old counts
-n_hv_reads_old <- c(691, 121, 18, 224, 2, 26, 6, 21, 4, 29, 12, 22)
-n_hv_reads_old[13] <- sum(n_hv_reads_old[which(read_counts$na_type=="RNA")])
-n_hv_reads_old[14] <- sum(n_hv_reads_old[which(read_counts$na_type=="DNA")])
-
-read_counts_comp <- read_counts_agg %>%
- mutate(n_reads_hv_old = n_hv_reads_old,
- p_reads_hv_old = n_reads_hv_old/n_reads_raw)
-```
-
-Applying a disjunctive cutoff at S=20 identifies 13,809 RNA reads and 66 DNA reads as human-viral. This gives an overall relative HV abundance of $5.00 \times 10^{-5}$ for RNA reads and $3.66 \times 10^{-7}$ for DNA reads. In contrast, the overall HV in the public dashboard is $3.93 \times 10^{-6}$ for RNA reads and $4.54 \times 10^{-7}$; the measured HV has thus increased 12.7x for RNA reads, but *decreased* slightly for DNA reads.
-
-```{r}
-#| label: plot-hv-ra
-#| warning: false
-# Visualize
-g_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=na_type)) +
- geom_point(aes(y=p_reads_hv)) +
- scale_y_log10("Relative abundance of human virus reads") +
- scale_color_na() + theme_kit
-g_phv_agg
-```
-
-Digging into individual viruses, we see that fecal-oral viruses predominate, especially *Mamastrovirus*, *Rotavirus*, *Salivirus*, and fecal-oral *Enterovirus* species (especially *Enterovirus C*, which includes poliovirus):
-
-```{r}
-#| label: raise-hv-taxa
-
-# Get viral taxon names for putative HV reads
-viral_taxa$name[viral_taxa$taxid == 249588] <- "Mamastrovirus"
-viral_taxa$name[viral_taxa$taxid == 194960] <- "Kobuvirus"
-viral_taxa$name[viral_taxa$taxid == 688449] <- "Salivirus"
-viral_taxa$name[viral_taxa$taxid == 585893] <- "Picobirnaviridae"
-viral_taxa$name[viral_taxa$taxid == 333922] <- "Betapapillomavirus"
-
-viral_taxa$name[viral_taxa$taxid == 334207] <- "Betapapillomavirus 3"
-viral_taxa$name[viral_taxa$taxid == 369960] <- "Porcine type-C oncovirus"
-viral_taxa$name[viral_taxa$taxid == 333924] <- "Betapapillomavirus 2"
-
-mrg_hv_named <- mrg_hv %>% left_join(viral_taxa, by="taxid")
-
-# Discover viral species & genera for HV reads
-raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
- # Get higher ranks than search rank
- ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
- rank_match <- which.max(ranks == out_rank)
- high_ranks <- ranks[rank_match:length(ranks)]
- # Merge read DB and taxid DB
- reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%
- left_join(taxid_db, by="taxid")
- # Extract sequences that are already at appropriate rank
- reads_rank <- filter(reads, rank == out_rank)
- # Drop sequences at a higher rank and return unclassified sequences
- reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
- while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
- # Promote read taxids and re-merge with taxid DB, then re-classify and filter
- reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
- select(-parent_taxid, -rank, -name) %>%
- left_join(taxid_db, by="taxid")
- reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
- bind_rows(reads_rank)
- reads_norank <- reads_remaining %>%
- filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
- }
- # Finally, extract and append reads that were excluded during the process
- reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
- reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%
- select(-parent_taxid, -rank, -name) %>%
- left_join(taxid_db, by="taxid")
- return(reads_out)
-}
-hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
-hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
-hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
-```
-
-```{r}
-#| label: hv-families
-
-# Count reads for each human-viral family
-hv_family_counts <- hv_reads_family %>%
- group_by(sample, date, na_type, name, taxid) %>%
- count(name = "n_reads_hv") %>%
- group_by(sample, date, na_type) %>%
- mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
-
-# Identify high-ranking families and group others
-hv_family_major_tab <- hv_family_counts %>% group_by(name) %>%
- filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
- arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.01)
-hv_family_counts_major <- hv_family_counts %>%
- mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")) %>%
- group_by(sample, date, na_type, name_display) %>%
- summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
- .groups="drop") %>%
- mutate(name_display = factor(name_display,
- levels = c(hv_family_major_tab$name, "Other")))
-
-# Plot family composition
-palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
-g_hv_families <- ggplot(hv_family_counts_major,
- aes(x=date, y=p_reads_hv, fill=name_display)) +
- geom_col(position="stack") +
- scale_fill_manual(name="Viral\nfamily", values=palette) +
- scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
- facet_grid(.~na_type) +
- guides(fill=guide_legend(ncol=3)) +
- theme_kit
-g_hv_families
-```
-
-```{r}
-#| label: hv-genera
-
-# Count reads for each human-viral genus
-hv_genus_counts <- hv_reads_genus %>%
- group_by(sample, date, na_type, name, taxid) %>%
- count(name = "n_reads_hv") %>%
- group_by(sample, date, na_type) %>%
- mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
-
-# Identify high-ranking genera and group others
-hv_genus_major_tab <- hv_genus_counts %>% group_by(name) %>%
- filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
- arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.02)
-hv_genus_counts_major <- hv_genus_counts %>%
- mutate(name_display = ifelse(name %in% hv_genus_major_tab$name, name, "Other")) %>%
- group_by(sample, date, na_type, name_display) %>%
- summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
- .groups="drop") %>%
- mutate(name_display = factor(name_display,
- levels = c(hv_genus_major_tab$name, "Other")))
-
-# Plot genus composition
-palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
-g_hv_genera <- ggplot(hv_genus_counts_major,
- aes(x=date, y=p_reads_hv, fill=name_display)) +
- geom_col(position="stack") +
- scale_fill_manual(name="Viral\ngenus", values=palette) +
- scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
- facet_grid(.~na_type) +
- guides(fill=guide_legend(ncol=3)) +
- theme_kit
-g_hv_genera
-```
-
-```{r}
-#| label: hv-species
-
-# Count reads for each human-viral species
-hv_species_counts <- hv_reads_species %>%
- group_by(sample, date, na_type, name, taxid) %>%
- count(name = "n_reads_hv") %>%
- group_by(sample, date, na_type) %>%
- mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
-
-# Identify high-ranking species and group others
-hv_species_major_tab <- hv_species_counts %>% group_by(name, taxid) %>%
- filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
- arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.05)
-hv_species_counts_major <- hv_species_counts %>%
- mutate(name_display = ifelse(name %in% hv_species_major_tab$name, name, "Other")) %>%
- group_by(sample, date, na_type, name_display) %>%
- summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
- .groups="drop") %>%
- mutate(name_display = factor(name_display,
- levels = c(hv_species_major_tab$name, "Other")))
-
-# Plot species composition
-palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
-g_hv_species <- ggplot(hv_species_counts_major,
- aes(x=date, y=p_reads_hv, fill=name_display)) +
- geom_col(position="stack") +
- scale_fill_manual(name="Viral\nspecies", values=palette) +
- scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
- facet_grid(.~na_type) +
- guides(fill=guide_legend(ncol=3)) +
- theme_kit
-g_hv_species
-```
-
-By far the most common virus according to my pipeline (with over 91% of human-viral reads) is *Rotavirus A*; second (with 8%) is *Orthopicobirnavirus hominis*. Together, these two enteric RNA viruses dominate HV RNA reads in all samples. Both appear to be real; at least, BLASTN primarily maps these reads to the same or closely-related viruses to that identified by Bowtie2:
-
-```{r}
-#| label: hv-rotavirus
-
-# Import names db
-tax_names_path <- file.path(data_dir, "taxid-names.tsv.gz")
-tax_names <- read_tsv(tax_names_path, show_col_types = FALSE)
-
-# Add missing names
-tax_names_new <- tibble(
- staxid = c(557241, 557245, 557232, 557247, 557242, 557245),
- name = c("Canine rotavirus A79-10/G3P[3]", "Human rotavirus Ro1845", "Canine rotavirus K9",
- "Human rotavirus HCR3A", "Feline rotavirus Cat97/G3P[3]", "Feline rotavirus Cat2/G3P[9]")
-)
-tax_names <- bind_rows(tax_names, tax_names_new)
-
-rota_ids <- hv_reads_species %>% filter(taxid==28875) %>% pull(seq_id)
-rota_hits <- blast_results_paired %>%
- filter(seq_id %in% rota_ids) %>%
- mutate(staxid = as.integer(staxid))
-rota_hits_sorted <- rota_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%
- left_join(tax_names, by="staxid") %>%
- mutate(name = fct_inorder(name))
-rota_hits_sorted_head <- rota_hits_sorted %>% head(10) %>%
- mutate(name=fct_inorder(as.character(name)))
-g_rota <- ggplot(rota_hits_sorted_head,
- aes(x=n, y=fct_inorder(name))) + geom_col() +
- scale_x_continuous(name = "# Mapped read pairs") + theme_base +
- theme(axis.title.y = element_blank())
-g_rota
-```
-
-```{r}
-#| label: hv-orthopicobirnavirus
-
-# Add missing names
-tax_names_new <- tibble(
- staxid = c(442302, 3003954, 585895),
- name = c("Porcine picobirnavirus", "ticpantry virus 7", "uncultured picobirnavirus")
-)
-tax_names <- bind_rows(tax_names, tax_names_new)
-
-pico_ids <- hv_reads_species %>% filter(taxid==2956252) %>% pull(seq_id)
-pico_hits <- blast_results_paired %>%
- filter(seq_id %in% pico_ids) %>%
- mutate(staxid = as.integer(staxid))
-pico_hits_sorted <- pico_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%
- left_join(tax_names, by="staxid") %>%
- mutate(name = fct_inorder(name))
-pico_hits_sorted_head <- pico_hits_sorted %>% head(10) %>%
- mutate(name=fct_inorder(as.character(name)))
-g_pico <- ggplot(pico_hits_sorted_head,
- aes(x=n, y=fct_inorder(name))) + geom_col() +
- scale_x_continuous(name = "# Mapped read pairs") + theme_base +
- theme(axis.title.y = element_blank())
-g_pico
-```
-
-# Conclusion
-
-I'm quite happy with the quality of the human-viral selection process for this dataset, which ended up achieving very high precision and sensitivity. The most striking result coming out of this analysis is probably the drastic difference in total HV abundance between RNA and DNA reads, with the former \>100x higher despite very similar processing methods. In the past we've attributed low HV abundance in the DNA datasets we've analyzed to the lack of viral enrichment in available DNA datasets; in this case, however, there is very little difference between the DNA and RNA processing methods, so this explanation doesn't really hold. This makes me more pessimistic about achieving good HV relative abundance with DNA sequencing, even with improved viral enrichment methods.
+---
+
+```{r}
+#| label: load-packages
+#| include: false
+library(tidyverse)
+library(cowplot)
+library(patchwork)
+library(fastqcr)
+library(RColorBrewer)
+source("../scripts/aux_plot-theme.R")
+theme_base <- theme_base + theme(aspect.ratio = NULL)
+theme_kit <- theme_base + theme(
+ axis.text.x = element_text(hjust = 1, angle = 45),
+ axis.title.x = element_blank(),
+)
+tnl <- theme(legend.position = "none")
+```
+
+Continuing my analysis of datasets from the [P2RA preprint](https://doi.org/10.1101/2023.12.22.23300450), I analyzed the data from [Brumfield et al. (2022)](https://doi.org/10.1128/mbio.00591-22). This study conducted longitudinal monitoring of SARS-CoV-2 via qPCR from a single manhole in Maryland, combined with comprehensive microbiome analysis using metagenomics during a major COVID spike in early 2021. In total, they sequenced six samples from February to April 2021.
+
+One unusual feature that makes this study interesting is that they conducted both RNA and DNA sequencing on each study. Prior to sequencing, samples underwent concentration with the InnovaPrep Concentrating Pipette, followed by separate DNA & RNA extraction using different kits. RNA samples underwent rRNA depletion prior to library prep. All samples were sequenced on an Illumina HiSeq4000 with 2x150bp reads.
+
+# The raw data
+
+The 6 samples from the Brumfield dataset yielded 24M-45M (mean 33M) DNA-sequencing reads and 29M-64M (mean 46M) RNA-sequencing reads per sample, for a total of 196M DNA read pairs and 276M RNA read pairs (59 and 83 gigabases of sequence, respectively). Read qualities were mostly high but tailed off at the 3' end in some samples, suggesting the need for trimming. Adapter levels were very high. Inferred duplication levels were 44-58% in DNA reads and 90-96% in RNA reads, i.e. very high.
+
+```{r}
+#| warning: false
+#| label: read-qc-data
+
+# Data input paths
+data_dir <- "../data/2024-03-19_brumfield/"
+libraries_path <- file.path(data_dir, "sample-metadata.csv")
+basic_stats_path <- file.path(data_dir, "qc_basic_stats.tsv")
+adapter_stats_path <- file.path(data_dir, "qc_adapter_stats.tsv")
+quality_base_stats_path <- file.path(data_dir, "qc_quality_base_stats.tsv")
+quality_seq_stats_path <- file.path(data_dir, "qc_quality_sequence_stats.tsv")
+
+# Import libraries and extract metadata from sample names
+libraries <- read_csv(libraries_path, show_col_types = FALSE) %>%
+ arrange(date) %>% mutate(date = fct_inorder(as.character(date))) %>%
+ arrange(desc(na_type)) %>% mutate(na_type = fct_inorder(na_type))
+
+# Import QC data
+stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary")
+basic_stats <- read_tsv(basic_stats_path, show_col_types = FALSE) %>%
+ inner_join(libraries, by="sample") %>%
+ mutate(stage = factor(stage, levels = stages),
+ sample = fct_inorder(sample))
+adapter_stats <- read_tsv(adapter_stats_path, show_col_types = FALSE) %>%
+ mutate(sample = sub("_2$", "", sample)) %>%
+ inner_join(libraries, by="sample") %>%
+ mutate(stage = factor(stage, levels = stages),
+ read_pair = fct_inorder(as.character(read_pair)))
+quality_base_stats <- read_tsv(quality_base_stats_path, show_col_types = FALSE) %>%
+ inner_join(libraries, by="sample") %>%
+ mutate(stage = factor(stage, levels = stages),
+ read_pair = fct_inorder(as.character(read_pair)))
+quality_seq_stats <- read_tsv(quality_seq_stats_path, show_col_types = FALSE) %>%
+ inner_join(libraries, by="sample") %>%
+ mutate(stage = factor(stage, levels = stages),
+ read_pair = fct_inorder(as.character(read_pair)))
+
+# Filter to raw data
+basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat")
+adapter_stats_raw <- adapter_stats %>% filter(stage == "raw_concat")
+quality_base_stats_raw <- quality_base_stats %>% filter(stage == "raw_concat")
+quality_seq_stats_raw <- quality_seq_stats %>% filter(stage == "raw_concat")
+```
+
+```{r}
+#| fig-height: 5
+#| warning: false
+#| label: plot-basic-stats
+
+# Visualize basic stats
+scale_fill_na <- purrr::partial(scale_fill_brewer, palette="Set1", name="Nucleic acid type")
+g_basic <- ggplot(basic_stats_raw, aes(x=date, fill=na_type, group=sample)) +
+ scale_fill_na() + theme_kit
+g_nreads_raw <- g_basic + geom_col(aes(y=n_read_pairs), position = "dodge") +
+ scale_y_continuous(name="# Read pairs", expand=c(0,0))
+g_nreads_raw_2 <- g_nreads_raw + theme(legend.position = "none")
+legend_group <- get_legend(g_nreads_raw)
+
+g_nbases_raw <- g_basic + geom_col(aes(y=n_bases_approx), position = "dodge") +
+ scale_y_continuous(name="Total base pairs (approx)", expand=c(0,0)) +
+ theme(legend.position = "none")
+g_ndup_raw <- g_basic + geom_col(aes(y=percent_duplicates), position = "dodge") +
+ scale_y_continuous(name="% Duplicates (FASTQC)", expand=c(0,0), limits = c(0,100), breaks = seq(0,100,20)) +
+ theme(legend.position = "none")
+
+g_basic_raw <- plot_grid(g_nreads_raw_2 + g_nbases_raw + g_ndup_raw, legend_group, ncol = 1, rel_heights = c(1,0.1))
+g_basic_raw
+```
+
+```{r}
+#| label: plot-raw-quality
+
+scale_color_na <- purrr::partial(scale_color_brewer,palette="Set1",name="Nucleic acid type")
+g_qual_raw <- ggplot(mapping=aes(color=na_type, linetype=read_pair,
+ group=interaction(sample,read_pair))) +
+ scale_color_na() + scale_linetype_discrete(name = "Read Pair") +
+ guides(color=guide_legend(nrow=2,byrow=TRUE),
+ linetype = guide_legend(nrow=2,byrow=TRUE)) +
+ theme_base
+
+# Visualize adapters
+g_adapters_raw <- g_qual_raw +
+ geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +
+ scale_y_continuous(name="% Adapters", limits=c(0,NA),
+ breaks = seq(0,100,10), expand=c(0,0)) +
+ scale_x_continuous(name="Position", limits=c(0,NA),
+ breaks=seq(0,140,20), expand=c(0,0)) +
+ facet_wrap(~adapter)
+g_adapters_raw
+
+# Visualize quality
+g_quality_base_raw <- g_qual_raw +
+ geom_hline(yintercept=25, linetype="dashed", color="red") +
+ geom_hline(yintercept=30, linetype="dashed", color="red") +
+ geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +
+ scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+ scale_x_continuous(name="Position", limits=c(0,NA),
+ breaks=seq(0,140,20), expand=c(0,0))
+g_quality_base_raw
+
+g_quality_seq_raw <- g_qual_raw +
+ geom_vline(xintercept=25, linetype="dashed", color="red") +
+ geom_vline(xintercept=30, linetype="dashed", color="red") +
+ geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +
+ scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+ scale_y_continuous(name="# Sequences", expand=c(0,0))
+g_quality_seq_raw
+```
+
+# Preprocessing
+
+The average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. On average, cleaning & deduplication together removed about 50% of total reads from DNA libraries and about 92% from RNA libraries, primarily during deduplication. Only a few reads (low single-digit percentages at most) were lost during subsequent ribodepletion, even in RNA reads, consistent with the samples having undergone rRNA depletion prior to sequencing.
+
+```{r}
+#| label: preproc-table
+n_reads_rel <- basic_stats %>%
+ select(sample, na_type, stage, percent_duplicates, n_read_pairs) %>%
+ group_by(sample, na_type) %>% arrange(sample, na_type, stage) %>%
+ mutate(p_reads_retained = n_read_pairs / lag(n_read_pairs),
+ p_reads_lost = 1 - p_reads_retained,
+ p_reads_retained_abs = n_read_pairs / n_read_pairs[1],
+ p_reads_lost_abs = 1-p_reads_retained_abs,
+ p_reads_lost_abs_marginal = p_reads_lost_abs - lag(p_reads_lost_abs))
+n_reads_rel_display <- n_reads_rel %>% rename(Stage=stage, `NA Type`=na_type) %>%
+ group_by(`NA Type`, Stage) %>%
+ summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"),
+ `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>%
+ filter(Stage != "raw_concat") %>%
+ mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion")))
+n_reads_rel_display
+```
+
+```{r}
+#| label: preproc-figures
+#| warning: false
+# Plot reads over preprocessing
+g_reads_stages <- ggplot(basic_stats, aes(x=stage, y=n_read_pairs,fill=na_type,group=sample)) +
+ geom_col(position="dodge") + scale_fill_na() +
+ scale_y_continuous("# Read pairs", expand=c(0,0)) +
+ theme_kit
+g_reads_stages
+
+# Plot relative read losses during preprocessing
+g_reads_rel <- ggplot(n_reads_rel, aes(x=stage, y=p_reads_lost_abs_marginal,fill=na_type,group=sample)) +
+ geom_col(position="dodge") + scale_fill_na() +
+ scale_y_continuous("% Total Reads Lost", expand=c(0,0), labels = function(x) x*100) +
+ theme_kit
+g_reads_rel
+```
+
+Data cleaning with FASTP was very successful at removing adapters, with very few adapter sequences found by FASTQC at any stage after the raw data. FASTP was also successful at improving read quality.
+
+```{r}
+#| warning: false
+#| label: plot-quality
+
+g_qual <- ggplot(mapping=aes(color=na_type, linetype=read_pair,
+ group=interaction(sample,read_pair))) +
+ scale_color_na() + scale_linetype_discrete(name = "Read Pair") +
+ guides(color=guide_legend(nrow=2,byrow=TRUE),
+ linetype = guide_legend(nrow=2,byrow=TRUE)) +
+ theme_base
+
+# Visualize adapters
+g_adapters <- g_qual +
+ geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +
+ scale_y_continuous(name="% Adapters", limits=c(0,20),
+ breaks = seq(0,50,10), expand=c(0,0)) +
+ scale_x_continuous(name="Position", limits=c(0,NA),
+ breaks=seq(0,140,20), expand=c(0,0)) +
+ facet_grid(stage~adapter)
+g_adapters
+
+# Visualize quality
+g_quality_base <- g_qual +
+ geom_hline(yintercept=25, linetype="dashed", color="red") +
+ geom_hline(yintercept=30, linetype="dashed", color="red") +
+ geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +
+ scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+ scale_x_continuous(name="Position", limits=c(0,NA),
+ breaks=seq(0,140,20), expand=c(0,0)) +
+ facet_grid(stage~.)
+g_quality_base
+
+g_quality_seq <- g_qual +
+ geom_vline(xintercept=25, linetype="dashed", color="red") +
+ geom_vline(xintercept=30, linetype="dashed", color="red") +
+ geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +
+ scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+ scale_y_continuous(name="# Sequences", expand=c(0,0)) +
+ facet_grid(stage~.)
+g_quality_seq
+```
+
+According to FASTQC, deduplication was moderately effective at reducing measured duplicate levels, with FASTQC-measured levels falling from an average of 50% to one of 16% for DNA reads, and from 93% to 42% for RNA reads:
+
+```{r}
+#| label: preproc-dedup
+
+g_dup_stages <- ggplot(basic_stats, aes(x=stage, y=percent_duplicates, fill=na_type, group=sample)) +
+ geom_col(position="dodge") + scale_fill_na() +
+ scale_y_continuous("% Duplicates", limits=c(0,100), breaks=seq(0,100,20), expand=c(0,0)) +
+ theme_kit
+g_readlen_stages <- ggplot(basic_stats, aes(x=stage, y=mean_seq_len, fill=na_type, group=sample)) +
+ geom_col(position="dodge") + scale_fill_na() +
+ scale_y_continuous("Mean read length (nt)", expand=c(0,0)) +
+ theme_kit
+legend_loc <- get_legend(g_dup_stages)
+g_dup_stages
+g_readlen_stages
+```
+
+# High-level composition
+
+As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:
+
+```{r}
+#| label: plot-composition-all
+
+# Import Bracken data
+bracken_path <- file.path(data_dir, "bracken_counts.tsv")
+bracken <- read_tsv(bracken_path, show_col_types = FALSE)
+total_assigned <- bracken %>% group_by(sample) %>% summarize(
+ name = "Total",
+ kraken_assigned_reads = sum(kraken_assigned_reads),
+ added_reads = sum(added_reads),
+ new_est_reads = sum(new_est_reads),
+ fraction_total_reads = sum(fraction_total_reads)
+)
+bracken_spread <- bracken %>% select(name, sample, new_est_reads) %>%
+ mutate(name = tolower(name)) %>%
+ pivot_wider(id_cols = "sample", names_from = "name", values_from = "new_est_reads")
+# Count reads
+read_counts_preproc <- basic_stats %>%
+ select(sample, na_type, date, stage, n_read_pairs) %>%
+ pivot_wider(id_cols = c("sample", "na_type", "date"), names_from="stage", values_from="n_read_pairs")
+read_counts <- read_counts_preproc %>%
+ inner_join(total_assigned %>% select(sample, new_est_reads), by = "sample") %>%
+ rename(assigned = new_est_reads) %>%
+ inner_join(bracken_spread, by="sample")
+# Assess composition
+read_comp <- transmute(read_counts, sample=sample, na_type=na_type, date=date,
+ n_filtered = raw_concat-cleaned,
+ n_duplicate = cleaned-dedup,
+ n_ribosomal = (dedup-ribo_initial) + (ribo_initial-ribo_secondary),
+ n_unassigned = ribo_secondary-assigned,
+ n_bacterial = bacteria,
+ n_archaeal = archaea,
+ n_viral = viruses,
+ n_human = eukaryota)
+read_comp_long <- pivot_longer(read_comp, -(sample:date), names_to = "classification",
+ names_prefix = "n_", values_to = "n_reads") %>%
+ mutate(classification = fct_inorder(str_to_sentence(classification))) %>%
+ group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads))
+read_comp_summ <- read_comp_long %>%
+ group_by(na_type, classification) %>%
+ summarize(n_reads = sum(n_reads), .groups = "drop_last") %>%
+ mutate(n_reads = replace_na(n_reads,0),
+ p_reads = n_reads/sum(n_reads),
+ pc_reads = p_reads*100)
+
+# Plot overall composition
+g_comp <- ggplot(read_comp_long, aes(x=sample, y=p_reads, fill=classification)) +
+ geom_col(position = "stack") +
+ scale_y_continuous(name = "% Reads", limits = c(0,1.01), breaks = seq(0,1,0.2),
+ expand = c(0,0), labels = function(x) x*100) +
+ scale_fill_brewer(palette = "Set1", name = "Classification") +
+ facet_wrap(.~na_type, scales="free", ncol=5) +
+ theme_kit
+g_comp
+
+# Plot composition of minor components
+read_comp_minor <- read_comp_long %>% filter(classification %in% c("Archaeal", "Viral", "Human", "Other"))
+palette_minor <- brewer.pal(9, "Set1")[6:9]
+g_comp_minor <- ggplot(read_comp_minor, aes(x=sample, y=p_reads, fill=classification)) +
+ geom_col(position = "stack") +
+ scale_y_continuous(name = "% Reads",
+ expand = c(0,0), labels = function(x) x*100) +
+ scale_fill_manual(values=palette_minor, name = "Classification") +
+ facet_wrap(.~na_type, scales = "free_x", ncol=5) +
+ theme_kit
+g_comp_minor
+
+```
+
+```{r}
+#| label: composition-summary
+
+p_reads_summ_group <- read_comp_long %>%
+ mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), "Excluded", as.character(classification)),
+ classification = fct_inorder(classification)) %>%
+ group_by(classification, sample, na_type) %>%
+ summarize(p_reads = sum(p_reads), .groups = "drop") %>%
+ group_by(classification, na_type) %>%
+ summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100,
+ pc_mean = mean(p_reads)*100, .groups = "drop")
+p_reads_summ_prep <- p_reads_summ_group %>%
+ mutate(classification = fct_inorder(classification),
+ pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+ pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+ pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+ display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)"))
+p_reads_summ <- p_reads_summ_prep %>%
+ select(classification, na_type, display) %>%
+ pivot_wider(names_from=na_type, values_from = display)
+p_reads_summ
+```
+
+As previously noted, RNA samples were overwhelmingly composed of duplicates. Despite this, the RNA samples achieved a decently high level of total viral reads, with an average of 0.55% of reads mapping to viruses (higher than Crits-Christoph). Levels of total viral reads were much lower in DNA libraries, which were dominated by unassigned and (non-ribosomal) bacterial sequences.
+
+Within viral reads, RNA libraries were (as usual) dominated by plant viruses, particularly *Virgaviridae* – though one sample, unusually, has a substantial minority of reads from *Fiersviridae*, a family of RNA phages. Detected DNA viruses come from a wider range of families, but the most abundant families (*Suoliviridae*, *Intestiviridae*, *Autographiviridae*, *Myoviridae*) are all DNA phages.
+
+```{r}
+#| label: plot-viral-families-all
+
+# Get viral taxonomy
+viral_taxa_path <- file.path(data_dir, "viral-taxids.tsv.gz")
+viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)
+
+# Import Kraken reports & extract & count viral families
+samples <- as.character(basic_stats_raw$sample)
+col_names <- c("pc_reads_total", "n_reads_clade", "n_reads_direct", "rank", "taxid", "name")
+report_paths <- paste0(data_dir, "kraken/", samples, ".report.gz")
+kraken_reports <- lapply(1:length(samples),
+ function(n) read_tsv(report_paths[n], col_names = col_names,
+ show_col_types = FALSE) %>%
+ mutate(sample = samples[n])) %>% bind_rows
+kraken_reports_viral <- filter(kraken_reports, taxid %in% viral_taxa$taxid) %>%
+ group_by(sample) %>%
+ mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])
+viral_families <- kraken_reports_viral %>% filter(rank == "F") %>%
+ inner_join(libraries, by="sample")
+
+# Identify major viral families
+viral_families_major_tab <- viral_families %>% group_by(name, taxid, na_type) %>%
+ summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>%
+ filter(p_reads_viral_max >= 0.04)
+viral_families_major_list <- viral_families_major_tab %>% pull(name)
+viral_families_major <- viral_families %>% filter(name %in% viral_families_major_list) %>%
+ select(name, taxid, sample, na_type, date, p_reads_viral)
+viral_families_minor <- viral_families_major %>% group_by(sample, date, na_type) %>%
+ summarize(p_reads_viral_major = sum(p_reads_viral), .groups = "drop") %>%
+ mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%
+ select(name, taxid, sample, na_type, date, p_reads_viral)
+viral_families_display <- bind_rows(viral_families_major, viral_families_minor) %>%
+ arrange(desc(p_reads_viral)) %>% mutate(name = factor(name, levels=c(viral_families_major_list, "Other")))
+g_families <- ggplot(viral_families_display, aes(x=date, y=p_reads_viral, fill=name)) +
+ geom_col(position="stack") +
+ scale_fill_brewer(palette = "Set3", name = "Viral family") +
+ scale_y_continuous(name="% Viral Reads", limits=c(0,1), breaks = seq(0,1,0.2),
+ expand=c(0,0), labels = function(y) y*100) +
+ facet_wrap(~na_type) +
+ theme_kit
+g_families
+```
+
+Given the very high level of duplicates in the RNA data, I also repeated the analysis from my second Yang et al. entry, re-running taxonomic composition analysis on pre-deduplication data and comparing the effects of deduplication on composition:
+
+```{r}
+#| label: plot-composition-dedup
+
+class_levels <- c("Unassigned", "Bacterial", "Archaeal", "Viral", "Human")
+subset_factor <- 0.05
+
+# 1. Remove filtered & duplicate reads from original Bracken output & renormalize
+read_comp_renorm <- read_comp_long %>%
+ filter(! classification %in% c("Filtered", "Duplicate")) %>%
+ group_by(sample) %>%
+ mutate(p_reads = n_reads/sum(n_reads),
+ classification = classification %>% as.character %>%
+ ifelse(. == "Ribosomal", "Bacterial", .)) %>%
+ group_by(sample, na_type, date, classification) %>%
+ summarize(n_reads = sum(n_reads), p_reads = sum(p_reads), .groups = "drop") %>%
+ mutate(classification = factor(classification, levels = class_levels))
+
+# 2. Import pre-deduplicated
+bracken_path_predup <- file.path(data_dir, "bracken_counts_subset.tsv")
+bracken_predup <- read_tsv(bracken_path_predup, show_col_types = FALSE)
+total_assigned_predup <- bracken_predup %>% group_by(sample) %>% summarize(
+ name = "Total",
+ kraken_assigned_reads = sum(kraken_assigned_reads),
+ added_reads = sum(added_reads),
+ new_est_reads = sum(new_est_reads),
+ fraction_total_reads = sum(fraction_total_reads)
+)
+bracken_spread_predup <- bracken_predup %>% select(name, sample, new_est_reads) %>%
+ mutate(name = tolower(name)) %>%
+ pivot_wider(id_cols = "sample", names_from = "name", values_from = "new_est_reads")
+# Count reads
+read_counts_predup <- read_counts_preproc %>%
+ select(sample, na_type, date, raw_concat, cleaned) %>%
+ mutate(raw_concat = raw_concat * subset_factor, cleaned = cleaned * subset_factor) %>%
+ inner_join(total_assigned_predup %>% select(sample, new_est_reads), by = "sample") %>%
+ rename(assigned = new_est_reads) %>%
+ inner_join(bracken_spread_predup, by="sample")
+# Assess composition
+read_comp_predup <- transmute(read_counts_predup, sample=sample, na_type = na_type,
+ date=date,
+ n_filtered = raw_concat-cleaned,
+ n_unassigned = cleaned-assigned,
+ n_bacterial = bacteria,
+ n_archaeal = archaea,
+ n_viral = viruses,
+ n_human = eukaryota)
+read_comp_predup_long <- pivot_longer(read_comp_predup, -(sample:date),
+ names_to = "classification",
+ names_prefix = "n_", values_to = "n_reads") %>%
+ mutate(classification = fct_inorder(str_to_sentence(classification))) %>%
+ group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads)) %>%
+ filter(! classification %in% c("Filtered", "Duplicate")) %>%
+ group_by(sample) %>%
+ mutate(p_reads = n_reads/sum(n_reads))
+
+# 3. Combine
+read_comp_comb <- bind_rows(read_comp_predup_long %>% mutate(deduplicated = FALSE),
+ read_comp_renorm %>% mutate(deduplicated = TRUE)) %>%
+ mutate(label = ifelse(deduplicated, "Post-dedup", "Pre-dedup") %>% fct_inorder)
+
+# Plot overall composition
+g_comp_predup <- ggplot(read_comp_comb, aes(x=label, y=p_reads, fill=classification)) +
+ geom_col(position = "stack") +
+ scale_y_continuous(name = "% Reads", limits = c(0,1.01), breaks = seq(0,1,0.2),
+ expand = c(0,0), labels = function(x) x*100) +
+ scale_fill_brewer(palette = "Set1", name = "Classification") +
+ facet_grid(na_type~date) +
+ theme_kit
+g_comp_predup
+
+```
+
+In general, deduplication has little effect on the composition of DNA samples, which remain primarily bacterial and unassigned. A few RNA samples show a reduction in the bacterial (more precisely, bacterial + rRNA) fraction after deduplication, consistent with the presence of some true biological duplicate sequences (most likely rRNA) that are being collapsed. Surprisingly, however, the largest effect is in the opposite direction, with several samples showing large increases in bacterial sequences following deduplication. This suggests that some highly abundant non-bacterial sequence is being collapsed in these samples, increasing the apparent abundance of bacteria.
+
+The increase in bacterial reads in these samples comes at the expense of (1) human reads and (2) the unassigned fraction. The former suggests the presence of human rRNA sequences that are being erroneously incorporated into the post-deduplication "bacterial" fraction; however, this effect is smaller than the decrease in unassigned reads. One possibility might be other non-human eukaryotic rRNA sequences, which Kraken is unable to assign using our current database.
+
+# Human-infecting virus reads: validation
+
+Next, I investigated the human-infecting virus read content of these unenriched samples. To get good results here, I needed to make some changes to the pipeline, including adding Trimmomatic as an additional primer-trimming step to prevent further primer-contamination-based false positives. However, for the sake of time I'm not describing them in detail here; if you want to see more I encourage you to check the commit log at the [workflow repo](https://github.com/naobservatory/mgs-workflow).
+
+Having made these changes, the workflow identified a total of 14,073 RNA read pairs and 70 DNA read pairs across all samples (0.09% and 0.00009% of surviving reads, respectively).
+
+```{r}
+#| label: hv-read-counts
+
+hv_reads_filtered_path <- file.path(data_dir, "hv_hits_putative_filtered.tsv.gz")
+hv_reads_filtered <- read_tsv(hv_reads_filtered_path, show_col_types = FALSE) %>%
+ inner_join(libraries, by="sample") %>%
+ arrange(date, na_type)
+n_hv_filtered <- hv_reads_filtered %>% group_by(sample, date, na_type) %>% count %>% inner_join(basic_stats %>% filter(stage == "ribo_initial") %>% select(sample, n_read_pairs), by="sample") %>% rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)
+n_hv_filtered_summ <- n_hv_filtered %>% group_by(na_type) %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)
+```
+
+```{r}
+#| label: plot-hv-scores
+#| warning: false
+#| fig-width: 8
+
+mrg <- hv_reads_filtered %>%
+ mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
+ ifelse(hit_hv, "Kraken2 HV\nhit",
+ "No hit or\nassignment"))) %>%
+ group_by(sample, na_type) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%
+ mutate(seq_num = row_number(),
+ adj_score_max = pmax(adj_score_fwd, adj_score_rev))
+# Import Bowtie2/Kraken data and perform filtering steps
+g_mrg <- ggplot(mrg, aes(x=adj_score_fwd, y=adj_score_rev)) +
+ geom_point(alpha=0.5, shape=16) +
+ scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ facet_grid(na_type~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
+ theme_base + theme(aspect.ratio=1)
+g_mrg
+
+g_hist_0 <- ggplot(mrg, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position="dodge") + facet_grid(na_type~kraken_label, scales="free_y") + scale_x_continuous(name = "Maximum adjusted alignment score") + scale_y_continuous(name="# Read pairs") + scale_fill_brewer(palette = "Dark2") + theme_base + geom_vline(xintercept=20, linetype="dashed", color="red")
+g_hist_0
+```
+
+As previously described, I ran BLASTN on these reads via a dedicated EC2 instance, using the same parameters I've used for previous datasets.
+
+```{r}
+#| label: make-blast-fasta
+mrg_fasta <- mrg %>%
+ mutate(seq_head = paste0(">", seq_id)) %>%
+ ungroup %>%
+ select(header1=seq_head, seq1=query_seq_fwd,
+ header2=seq_head, seq2=query_seq_rev) %>%
+ mutate(header1=paste0(header1, "_1"), header2=paste0(header2, "_2"))
+mrg_fasta_out <- do.call(paste, c(mrg_fasta, sep="\n")) %>% paste(collapse="\n")
+write(mrg_fasta_out, file.path(data_dir, "brumfield-putative-viral.fasta"))
+```
+
+```{r}
+#| label: process-blast-data
+#| warning: false
+# Import BLAST results
+blast_results_path <- file.path(data_dir, "blast/brumfield-putative-viral.blast.gz")
+blast_cols <- c("qseqid", "sseqid", "sgi", "staxid", "qlen", "evalue", "bitscore", "qcovs", "length", "pident", "mismatch", "gapopen", "sstrand", "qstart", "qend", "sstart", "send")
+blast_results <- read_tsv(blast_results_path, show_col_types = FALSE,
+ col_names = blast_cols, col_types = cols(.default="c"))
+
+# Add viral status
+blast_results_viral <- mutate(blast_results, viral = staxid %in% viral_taxa$taxid)
+
+# Filter for best hit for each query/subject, then rank hits for each query
+blast_results_best <- blast_results_viral %>% group_by(qseqid, staxid) %>%
+ filter(bitscore == max(bitscore)) %>%
+ filter(length == max(length)) %>% filter(row_number() == 1)
+blast_results_ranked <- blast_results_best %>%
+ group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))
+blast_results_highrank <- blast_results_ranked %>% filter(rank <= 5) %>%
+ mutate(read_pair = str_split(qseqid, "_") %>% sapply(nth, n=-1),
+ seq_id = str_split(qseqid, "_") %>% sapply(nth, n=1)) %>%
+ mutate(bitscore = as.numeric(bitscore))
+
+# Summarize by read pair and taxid
+blast_results_paired <- blast_results_highrank %>%
+ group_by(seq_id, staxid, viral) %>%
+ summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),
+ n_reads = n(), .groups = "drop") %>%
+ mutate(viral_full = viral & n_reads == 2)
+
+# Compare to Kraken & Bowtie assignments
+mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max, na_type)
+blast_results_assign <- left_join(blast_results_paired, mrg_assign, by="seq_id") %>%
+ mutate(taxid_match_bowtie = (staxid == taxid),
+ taxid_match_kraken = (staxid == assigned_taxid),
+ taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
+blast_results_out <- blast_results_assign %>%
+ group_by(seq_id) %>%
+ summarize(viral_status = ifelse(any(viral_full), 2,
+ ifelse(any(taxid_match_any), 2,
+ ifelse(any(viral), 1, 0))),
+ .groups = "drop")
+```
+
+```{r}
+#| label: plot-blast-results
+#| fig-height: 6
+#| warning: false
+
+# Merge BLAST results with unenriched read data
+mrg_blast <- full_join(mrg, blast_results_out, by="seq_id") %>%
+ mutate(viral_status = replace_na(viral_status, 0),
+ viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))
+mrg_blast_rna <- mrg_blast %>% filter(na_type == "RNA")
+mrg_blast_dna <- mrg_blast %>% filter(na_type == "DNA")
+
+# Plot RNA
+g_mrg_blast_rna <- mrg_blast_rna %>%
+ ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +
+ geom_point(alpha=0.5, shape=16) +
+ scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ scale_color_brewer(palette = "Set1", name = "Viral status") +
+ facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
+ theme_base + labs(title="RNA") +
+ theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))
+g_mrg_blast_rna
+
+# Plot DNA
+g_mrg_blast_dna <- mrg_blast_dna %>%
+ ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +
+ geom_point(alpha=0.5, shape=16) +
+ scale_x_continuous(name="S(forward read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ scale_y_continuous(name="S(reverse read)", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +
+ scale_color_brewer(palette = "Set1", name = "Viral status") +
+ facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +
+ theme_base + labs(title="DNA") +
+ theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))
+g_mrg_blast_dna
+```
+
+```{r}
+#| label: plot-blast-histogram
+g_hist_1 <- ggplot(mrg_blast, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position="dodge") + facet_grid(na_type~viral_status_out, scales = "free_y") + scale_x_continuous(name = "Maximum adjusted alignment score") + scale_y_continuous(name="# Read pairs") + scale_fill_brewer(palette = "Dark2") + theme_base + geom_vline(xintercept=20, linetype="dashed", color="red")
+g_hist_1
+```
+
+These results look very good on visual inspection, and indeed precision and sensitivity are both very high. For a disjunctive score threshold of 20, my updated workflow achieves an F1 score of 99.0% for RNA sequences and 99.2% for DNA sequences; more than high enough to continue on to human viral analysis.
+
+```{r}
+#| label: plot-f1
+test_sens_spec <- function(tab, score_threshold, conjunctive, include_special){
+ if (!include_special) tab <- filter(tab, viral_status_out %in% c("TRUE", "FALSE"))
+ tab_retained <- tab %>% mutate(
+ conjunctive = conjunctive,
+ retain_score_conjunctive = (adj_score_fwd > score_threshold & adj_score_rev > score_threshold),
+ retain_score_disjunctive = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),
+ retain_score = ifelse(conjunctive, retain_score_conjunctive, retain_score_disjunctive),
+ retain = assigned_hv | hit_hv | retain_score) %>%
+ group_by(viral_status_out, retain) %>% count
+ pos_tru <- tab_retained %>% filter(viral_status_out == "TRUE", retain) %>% pull(n) %>% sum
+ pos_fls <- tab_retained %>% filter(viral_status_out != "TRUE", retain) %>% pull(n) %>% sum
+ neg_tru <- tab_retained %>% filter(viral_status_out != "TRUE", !retain) %>% pull(n) %>% sum
+ neg_fls <- tab_retained %>% filter(viral_status_out == "TRUE", !retain) %>% pull(n) %>% sum
+ sensitivity <- pos_tru / (pos_tru + neg_fls)
+ specificity <- neg_tru / (neg_tru + pos_fls)
+ precision <- pos_tru / (pos_tru + pos_fls)
+ f1 <- 2 * precision * sensitivity / (precision + sensitivity)
+ out <- tibble(threshold=score_threshold, include_special = include_special,
+ conjunctive = conjunctive, sensitivity=sensitivity,
+ specificity=specificity, precision=precision, f1=f1)
+ return(out)
+}
+range_f1 <- function(intab, inc_special, inrange=15:45){
+ tss <- purrr::partial(test_sens_spec, tab=intab, include_special=inc_special)
+ stats_conj <- lapply(inrange, tss, conjunctive=TRUE) %>% bind_rows
+ stats_disj <- lapply(inrange, tss, conjunctive=FALSE) %>% bind_rows
+ stats_all <- bind_rows(stats_conj, stats_disj) %>%
+ pivot_longer(!(threshold:conjunctive), names_to="metric", values_to="value") %>%
+ mutate(conj_label = ifelse(conjunctive, "Conjunctive", "Disjunctive"))
+ return(stats_all)
+}
+inc_special <- FALSE
+stats_rna <- range_f1(mrg_blast_rna, inc_special) %>% mutate(na_type = "RNA")
+stats_dna <- range_f1(mrg_blast_dna, inc_special) %>% mutate(na_type = "DNA")
+stats_0 <- bind_rows(stats_rna, stats_dna) %>% mutate(attempt=0)
+threshold_opt_0 <- stats_0 %>% group_by(conj_label,attempt,na_type) %>%
+ filter(metric == "f1") %>%
+ filter(value == max(value)) %>% filter(threshold == min(threshold))
+g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
+ geom_vline(data = threshold_opt_0, mapping = aes(xintercept=threshold),
+ color = "red", linetype = "dashed") +
+ geom_line() +
+ scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
+ scale_x_continuous(name = "Threshold", expand = c(0,0)) +
+ scale_color_brewer(palette="Set3") +
+ facet_grid(na_type~conj_label) +
+ theme_base
+g_stats_0
+```
+
+# Human-infecting virus reads: analysis
+
+```{r}
+#| label: count-hv-reads
+
+# Get raw read counts
+read_counts_raw <- basic_stats_raw %>%
+ select(sample, date, na_type, n_reads_raw = n_read_pairs)
+
+# Get HV read counts
+mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | hit_hv | adj_score_max >= 20)
+read_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% count(name="n_reads_hv")
+read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>%
+ mutate(n_reads_hv = replace_na(n_reads_hv, 0))
+
+# Aggregate
+read_counts_total <- read_counts %>% group_by(na_type) %>%
+ summarize(n_reads_raw = sum(n_reads_raw),
+ n_reads_hv = sum(n_reads_hv)) %>%
+ mutate(sample= "All samples", date = "All dates")
+read_counts_agg <- read_counts %>% arrange(date) %>%
+ mutate(date = as.character(date)) %>% arrange(sample) %>%
+ bind_rows(read_counts_total) %>%
+ mutate(sample = fct_inorder(sample),
+ date = fct_inorder(date),
+ p_reads_hv = n_reads_hv/n_reads_raw)
+
+# Get old counts
+n_hv_reads_old <- c(691, 121, 18, 224, 2, 26, 6, 21, 4, 29, 12, 22)
+n_hv_reads_old[13] <- sum(n_hv_reads_old[which(read_counts$na_type=="RNA")])
+n_hv_reads_old[14] <- sum(n_hv_reads_old[which(read_counts$na_type=="DNA")])
+
+read_counts_comp <- read_counts_agg %>%
+ mutate(n_reads_hv_old = n_hv_reads_old,
+ p_reads_hv_old = n_reads_hv_old/n_reads_raw)
+```
+
+Applying a disjunctive cutoff at S=20 identifies 13,809 RNA reads and 66 DNA reads as human-viral. This gives an overall relative HV abundance of $5.00 \times 10^{-5}$ for RNA reads and $3.66 \times 10^{-7}$ for DNA reads. In contrast, the overall HV in the public dashboard is $3.93 \times 10^{-6}$ for RNA reads and $4.54 \times 10^{-7}$; the measured HV has thus increased 12.7x for RNA reads, but *decreased* slightly for DNA reads.
+
+```{r}
+#| label: plot-hv-ra
+#| warning: false
+# Visualize
+g_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=na_type)) +
+ geom_point(aes(y=p_reads_hv)) +
+ scale_y_log10("Relative abundance of human virus reads") +
+ scale_color_na() + theme_kit
+g_phv_agg
+```
+
+Digging into individual viruses, we see that fecal-oral viruses predominate, especially *Mamastrovirus*, *Rotavirus*, *Salivirus*, and fecal-oral *Enterovirus* species (especially *Enterovirus C*, which includes poliovirus):
+
+```{r}
+#| label: raise-hv-taxa
+
+# Get viral taxon names for putative HV reads
+viral_taxa$name[viral_taxa$taxid == 249588] <- "Mamastrovirus"
+viral_taxa$name[viral_taxa$taxid == 194960] <- "Kobuvirus"
+viral_taxa$name[viral_taxa$taxid == 688449] <- "Salivirus"
+viral_taxa$name[viral_taxa$taxid == 585893] <- "Picobirnaviridae"
+viral_taxa$name[viral_taxa$taxid == 333922] <- "Betapapillomavirus"
+
+viral_taxa$name[viral_taxa$taxid == 334207] <- "Betapapillomavirus 3"
+viral_taxa$name[viral_taxa$taxid == 369960] <- "Porcine type-C oncovirus"
+viral_taxa$name[viral_taxa$taxid == 333924] <- "Betapapillomavirus 2"
+
+mrg_hv_named <- mrg_hv %>% left_join(viral_taxa, by="taxid")
+
+# Discover viral species & genera for HV reads
+raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
+ # Get higher ranks than search rank
+ ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
+ rank_match <- which.max(ranks == out_rank)
+ high_ranks <- ranks[rank_match:length(ranks)]
+ # Merge read DB and taxid DB
+ reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%
+ left_join(taxid_db, by="taxid")
+ # Extract sequences that are already at appropriate rank
+ reads_rank <- filter(reads, rank == out_rank)
+ # Drop sequences at a higher rank and return unclassified sequences
+ reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
+ while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
+ # Promote read taxids and re-merge with taxid DB, then re-classify and filter
+ reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
+ select(-parent_taxid, -rank, -name) %>%
+ left_join(taxid_db, by="taxid")
+ reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
+ bind_rows(reads_rank)
+ reads_norank <- reads_remaining %>%
+ filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
+ }
+ # Finally, extract and append reads that were excluded during the process
+ reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
+ reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%
+ select(-parent_taxid, -rank, -name) %>%
+ left_join(taxid_db, by="taxid")
+ return(reads_out)
+}
+hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
+hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
+hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
+```
+
+```{r}
+#| label: hv-families
+
+# Count reads for each human-viral family
+hv_family_counts <- hv_reads_family %>%
+ group_by(sample, date, na_type, name, taxid) %>%
+ count(name = "n_reads_hv") %>%
+ group_by(sample, date, na_type) %>%
+ mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking families and group others
+hv_family_major_tab <- hv_family_counts %>% group_by(name) %>%
+ filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
+ arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.01)
+hv_family_counts_major <- hv_family_counts %>%
+ mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")) %>%
+ group_by(sample, date, na_type, name_display) %>%
+ summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
+ .groups="drop") %>%
+ mutate(name_display = factor(name_display,
+ levels = c(hv_family_major_tab$name, "Other")))
+
+# Plot family composition
+palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
+g_hv_families <- ggplot(hv_family_counts_major,
+ aes(x=date, y=p_reads_hv, fill=name_display)) +
+ geom_col(position="stack") +
+ scale_fill_manual(name="Viral\nfamily", values=palette) +
+ scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
+ facet_grid(.~na_type) +
+ guides(fill=guide_legend(ncol=3)) +
+ theme_kit
+g_hv_families
+```
+
+```{r}
+#| label: hv-genera
+
+# Count reads for each human-viral genus
+hv_genus_counts <- hv_reads_genus %>%
+ group_by(sample, date, na_type, name, taxid) %>%
+ count(name = "n_reads_hv") %>%
+ group_by(sample, date, na_type) %>%
+ mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking genera and group others
+hv_genus_major_tab <- hv_genus_counts %>% group_by(name) %>%
+ filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
+ arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.02)
+hv_genus_counts_major <- hv_genus_counts %>%
+ mutate(name_display = ifelse(name %in% hv_genus_major_tab$name, name, "Other")) %>%
+ group_by(sample, date, na_type, name_display) %>%
+ summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
+ .groups="drop") %>%
+ mutate(name_display = factor(name_display,
+ levels = c(hv_genus_major_tab$name, "Other")))
+
+# Plot genus composition
+palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
+g_hv_genera <- ggplot(hv_genus_counts_major,
+ aes(x=date, y=p_reads_hv, fill=name_display)) +
+ geom_col(position="stack") +
+ scale_fill_manual(name="Viral\ngenus", values=palette) +
+ scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
+ facet_grid(.~na_type) +
+ guides(fill=guide_legend(ncol=3)) +
+ theme_kit
+g_hv_genera
+```
+
+```{r}
+#| label: hv-species
+
+# Count reads for each human-viral species
+hv_species_counts <- hv_reads_species %>%
+ group_by(sample, date, na_type, name, taxid) %>%
+ count(name = "n_reads_hv") %>%
+ group_by(sample, date, na_type) %>%
+ mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking species and group others
+hv_species_major_tab <- hv_species_counts %>% group_by(name, taxid) %>%
+ filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
+ arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.05)
+hv_species_counts_major <- hv_species_counts %>%
+ mutate(name_display = ifelse(name %in% hv_species_major_tab$name, name, "Other")) %>%
+ group_by(sample, date, na_type, name_display) %>%
+ summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
+ .groups="drop") %>%
+ mutate(name_display = factor(name_display,
+ levels = c(hv_species_major_tab$name, "Other")))
+
+# Plot species composition
+palette <- c(brewer.pal(12, "Set3"), "#AAAAAA")
+g_hv_species <- ggplot(hv_species_counts_major,
+ aes(x=date, y=p_reads_hv, fill=name_display)) +
+ geom_col(position="stack") +
+ scale_fill_manual(name="Viral\nspecies", values=palette) +
+ scale_y_continuous(name="% HV Reads", limits=c(0,1), breaks=seq(0,1,0.2)) +
+ facet_grid(.~na_type) +
+ guides(fill=guide_legend(ncol=3)) +
+ theme_kit
+g_hv_species
+```
+
+By far the most common virus according to my pipeline (with over 91% of human-viral reads) is *Rotavirus A*; second (with 8%) is *Orthopicobirnavirus hominis*. Together, these two enteric RNA viruses dominate HV RNA reads in all samples. Both appear to be real; at least, BLASTN primarily maps these reads to the same or closely-related viruses to that identified by Bowtie2:
+
+```{r}
+#| label: hv-rotavirus
+
+# Import names db
+tax_names_path <- file.path(data_dir, "taxid-names.tsv.gz")
+tax_names <- read_tsv(tax_names_path, show_col_types = FALSE)
+
+# Add missing names
+tax_names_new <- tibble(
+ staxid = c(557241, 557245, 557232, 557247, 557242, 557245),
+ name = c("Canine rotavirus A79-10/G3P[3]", "Human rotavirus Ro1845", "Canine rotavirus K9",
+ "Human rotavirus HCR3A", "Feline rotavirus Cat97/G3P[3]", "Feline rotavirus Cat2/G3P[9]")
+)
+tax_names <- bind_rows(tax_names, tax_names_new)
+
+rota_ids <- hv_reads_species %>% filter(taxid==28875) %>% pull(seq_id)
+rota_hits <- blast_results_paired %>%
+ filter(seq_id %in% rota_ids) %>%
+ mutate(staxid = as.integer(staxid))
+rota_hits_sorted <- rota_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%
+ left_join(tax_names, by="staxid") %>%
+ mutate(name = fct_inorder(name))
+rota_hits_sorted_head <- rota_hits_sorted %>% head(10) %>%
+ mutate(name=fct_inorder(as.character(name)))
+g_rota <- ggplot(rota_hits_sorted_head,
+ aes(x=n, y=fct_inorder(name))) + geom_col() +
+ scale_x_continuous(name = "# Mapped read pairs") + theme_base +
+ theme(axis.title.y = element_blank())
+g_rota
+```
+
+```{r}
+#| label: hv-orthopicobirnavirus
+
+# Add missing names
+tax_names_new <- tibble(
+ staxid = c(442302, 3003954, 585895),
+ name = c("Porcine picobirnavirus", "ticpantry virus 7", "uncultured picobirnavirus")
+)
+tax_names <- bind_rows(tax_names, tax_names_new)
+
+pico_ids <- hv_reads_species %>% filter(taxid==2956252) %>% pull(seq_id)
+pico_hits <- blast_results_paired %>%
+ filter(seq_id %in% pico_ids) %>%
+ mutate(staxid = as.integer(staxid))
+pico_hits_sorted <- pico_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%
+ left_join(tax_names, by="staxid") %>%
+ mutate(name = fct_inorder(name))
+pico_hits_sorted_head <- pico_hits_sorted %>% head(10) %>%
+ mutate(name=fct_inorder(as.character(name)))
+g_pico <- ggplot(pico_hits_sorted_head,
+ aes(x=n, y=fct_inorder(name))) + geom_col() +
+ scale_x_continuous(name = "# Mapped read pairs") + theme_base +
+ theme(axis.title.y = element_blank())
+g_pico
+```
+
+# Conclusion
+
+I'm quite happy with the quality of the human-viral selection process for this dataset, which ended up achieving very high precision and sensitivity. The most striking result coming out of this analysis is probably the drastic difference in total HV abundance between RNA and DNA reads, with the former \>100x higher despite very similar processing methods. In the past we've attributed low HV abundance in the DNA datasets we've analyzed to the lack of viral enrichment in available DNA datasets; in this case, however, there is very little difference between the DNA and RNA processing methods, so this explanation doesn't really hold. This makes me more pessimistic about achieving good HV relative abundance with DNA sequencing, even with improved viral enrichment methods.
diff --git a/docs/search.json b/docs/search.json
index 80d9691..e2639fc 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -32,7 +32,7 @@
"href": "index.html",
"title": "Will's Public NAO Notebook",
"section": "",
- "text": "Workflow analysis of Spurbeck et al. (2023)\n\n\nCave carpa.\n\n\n\n\n\nApr 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nFollowup analysis of Yang et al. (2020)\n\n\nDigging into deduplication.\n\n\n\n\n\nMar 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Yang et al. (2020)\n\n\nWastewater from Xinjiang.\n\n\n\n\n\nMar 16, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nImproving read deduplication in the MGS workflow\n\n\nRemoving reverse-complement duplicates of human-viral reads.\n\n\n\n\n\nMar 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 2\n\n\nPanel-enriched samples.\n\n\n\n\n\nFeb 29, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 1\n\n\nUnenriched samples.\n\n\n\n\n\nFeb 27, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 3\n\n\nFixing the virus pipeline.\n\n\n\n\n\nFeb 15, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 2\n\n\nAbundance and composition of human-infecting viruses.\n\n\n\n\n\nFeb 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 1\n\n\nPreprocessing and composition.\n\n\n\n\n\nFeb 4, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nAutomating BLAST validation of human viral read assignment\n\n\nExperiments with BLASTN remote mode\n\n\n\n\n\nJan 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nProject Runway RNA-seq testing data: removing livestock reads\n\n\n\n\n\n\n\n\nDec 22, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Project Runway RNA-seq testing data\n\n\nApplying a new workflow to some oldish data.\n\n\n\n\n\nDec 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nEstimating the effect of read depth on duplication rate for Project Runway DNA data\n\n\nHow deep can we go?\n\n\n\n\n\nNov 8, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items"
+ "text": "Workflow analysis of Brumfield et al. (2022)\n\n\nWastewater from a manhole in Maryland.\n\n\n\n\n\nApr 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Spurbeck et al. (2023)\n\n\nCave carpa.\n\n\n\n\n\nApr 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nFollowup analysis of Yang et al. (2020)\n\n\nDigging into deduplication.\n\n\n\n\n\nMar 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Yang et al. (2020)\n\n\nWastewater from Xinjiang.\n\n\n\n\n\nMar 16, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nImproving read deduplication in the MGS workflow\n\n\nRemoving reverse-complement duplicates of human-viral reads.\n\n\n\n\n\nMar 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 2\n\n\nPanel-enriched samples.\n\n\n\n\n\nFeb 29, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 1\n\n\nUnenriched samples.\n\n\n\n\n\nFeb 27, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 3\n\n\nFixing the virus pipeline.\n\n\n\n\n\nFeb 15, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 2\n\n\nAbundance and composition of human-infecting viruses.\n\n\n\n\n\nFeb 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 1\n\n\nPreprocessing and composition.\n\n\n\n\n\nFeb 4, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nAutomating BLAST validation of human viral read assignment\n\n\nExperiments with BLASTN remote mode\n\n\n\n\n\nJan 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nProject Runway RNA-seq testing data: removing livestock reads\n\n\n\n\n\n\n\n\nDec 22, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Project Runway RNA-seq testing data\n\n\nApplying a new workflow to some oldish data.\n\n\n\n\n\nDec 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nEstimating the effect of read depth on duplication rate for Project Runway DNA data\n\n\nHow deep can we go?\n\n\n\n\n\nNov 8, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items"
},
{
"objectID": "notebooks/2023-10-12_fastp-vs-adapterremoval.html",
@@ -271,5 +271,12 @@
"title": "Workflow analysis of Spurbeck et al. (2023)",
"section": "",
"text": "Continuing my analysis of datasets from the P2RA preprint, I analyzed the data from Spurbeck et al. (2023), a wastewater RNA-sequencing study from Ohio. Samples for this study underwent a variety of processing protocols at different research centers across the state. Along with Rothman and Crits-Christoph, Spurbeck is one of three RNA-sequencing studies that underwent full in-depth analysis in the P2RA study, so is worth looking at closely here.\nThis one turned out to be a bit of a saga. As we’ll see in the section on human-virus identification, it took multiple tries and several substantial changes to the pipeline to get things to a state I was happy with. Still, I am happy with the outcome and think the changes will improve analysis of future datasets.\nThe raw data\nThe Spurbeck data comprises 55 samples from 8 processing groups, with 4 to 6 samples per group. The number of sequencing read pairs per sample varied widely from 4.5M-106.7M (mean 33.4M). Taken together, the samples totaled roughly 1.8B read pairs (425 gigabases of sequence). Read qualities were generally high but in need of some light preprocessing. Adapter levels were moderate. Inferred duplication levels were fairly high: 14-91% with a mean of 55%.\n\nCode# Data input paths\ndata_dir <- \"../data/2024-04-01_spurbeck/\"\nlibraries_path <- file.path(data_dir, \"sample-metadata.csv\")\nbasic_stats_path <- file.path(data_dir, \"qc_basic_stats_1.tsv\")\nadapter_stats_path <- file.path(data_dir, \"qc_adapter_stats.tsv\")\nquality_base_stats_path <- file.path(data_dir, \"qc_quality_base_stats.tsv\")\nquality_seq_stats_path <- file.path(data_dir, \"qc_quality_sequence_stats.tsv\")\n\n# Import libraries and extract metadata from sample names\nlibraries <- read_csv(libraries_path, show_col_types = FALSE)\n\n# Import QC data\nstages <- c(\"raw_concat\", \"cleaned\", \"dedup\", \"ribo_initial\", \"ribo_secondary\")\nbasic_stats <- read_tsv(basic_stats_path, show_col_types = FALSE) %>%\n # mutate(sample = sub(\"_2$\", \"\", sample)) %>%\n inner_join(libraries, by=\"sample\") %>% \n mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\nadapter_stats <- read_tsv(adapter_stats_path, show_col_types = FALSE) %>%\n mutate(sample = sub(\"_2$\", \"\", sample)) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\nquality_base_stats <- read_tsv(quality_base_stats_path, show_col_types = FALSE) %>%\n # mutate(sample = sub(\"_2$\", \"\", sample)) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\nquality_seq_stats <- read_tsv(quality_seq_stats_path, show_col_types = FALSE) %>%\n # mutate(sample = sub(\"_2$\", \"\", sample)) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\n\n# Filter to raw data\nbasic_stats_raw <- basic_stats %>% filter(stage == \"raw_concat\")\nadapter_stats_raw <- adapter_stats %>% filter(stage == \"raw_concat\")\nquality_base_stats_raw <- quality_base_stats %>% filter(stage == \"raw_concat\")\nquality_seq_stats_raw <- quality_seq_stats %>% filter(stage == \"raw_concat\")\n\n\n\nCode# Visualize basic stats\nscale_fill_grp <- purrr::partial(scale_fill_brewer, palette=\"Set3\", name=\"Processing group\")\ng_basic <- ggplot(basic_stats_raw, aes(x=group, fill=group, group=sample)) +\n scale_fill_grp() + theme_kit\ng_nreads_raw <- g_basic + geom_col(aes(y=n_read_pairs), position = \"dodge\") +\n scale_y_continuous(name=\"# Read pairs\", expand=c(0,0))\ng_nreads_raw_2 <- g_nreads_raw + theme(legend.position = \"none\")\nlegend_group <- get_legend(g_nreads_raw)\n\n\ng_nbases_raw <- g_basic + geom_col(aes(y=n_bases_approx), position = \"dodge\") +\n scale_y_continuous(name=\"Total base pairs (approx)\", expand=c(0,0)) + \n theme(legend.position = \"none\")\ng_ndup_raw <- g_basic + geom_col(aes(y=percent_duplicates), position = \"dodge\") +\n scale_y_continuous(name=\"% Duplicates (FASTQC)\", expand=c(0,0), limits = c(0,100), breaks = seq(0,100,20)) +\n theme(legend.position = \"none\")\n\ng_basic_raw <- plot_grid(g_nreads_raw_2 + g_nbases_raw + g_ndup_raw, legend_group, ncol = 1, rel_heights = c(1,0.1))\ng_basic_raw\n\n\n\n\n\n\n\n\nCodescale_color_grp <- purrr::partial(scale_color_brewer,palette=\"Set3\",name=\"Processing group\")\ng_qual_raw <- ggplot(mapping=aes(color=group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_grp() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters_raw <- g_qual_raw + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,50),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_wrap(~adapter)\ng_adapters_raw\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base_raw <- g_qual_raw +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0))\ng_quality_base_raw\n\n\n\n\n\n\nCodeg_quality_seq_raw <- g_qual_raw +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0))\ng_quality_seq_raw\n\n\n\n\n\n\n\nPreprocessing\nThe average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. Significant numbers of reads were lost at each stage in the preprocessing pipeline. In total, cleaning, deduplication and initial ribodepletion removed 17-96% (mean 72%) of input reads, while secondary ribodepletion removed an additional 0-11% (mean 6%).\n\nCoden_reads_rel <- basic_stats %>% \n select(sample, group, stage, percent_duplicates, n_read_pairs) %>%\n group_by(sample, group) %>% arrange(sample, group, stage) %>%\n mutate(p_reads_retained = n_read_pairs / lag(n_read_pairs),\n p_reads_lost = 1 - p_reads_retained,\n p_reads_retained_abs = n_read_pairs / n_read_pairs[1],\n p_reads_lost_abs = 1-p_reads_retained_abs,\n p_reads_lost_abs_marginal = p_reads_lost_abs - lag(p_reads_lost_abs))\nn_reads_rel_display <- n_reads_rel %>% rename(Stage=stage) %>% group_by(Stage) %>% \n summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), \"-\", round(max(p_reads_lost_abs*100),1), \" (mean \", round(mean(p_reads_lost_abs*100),1), \")\"),\n `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), \"-\", round(max(p_reads_lost_abs_marginal*100),1), \" (mean \", round(mean(p_reads_lost_abs_marginal*100),1), \")\")) %>% tail(-1) %>%\n mutate(Stage = Stage %>% as.numeric %>% factor(labels=c(\"Trimming & filtering\", \"Deduplication\", \"Initial ribodepletion\", \"Secondary ribodepletion\")))\nn_reads_rel_display\n\n\n \n\n\n\n\nCode# Plot reads over preprocessing\ng_reads_stages <- ggplot(basic_stats, aes(x=stage, y=n_read_pairs,fill=group,group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_grp() +\n scale_y_continuous(\"# Read pairs\", expand=c(0,0)) +\n theme_kit\ng_reads_stages\n\n\n\n\n\n\nCode# Plot relative read losses during preprocessing\ng_reads_rel <- ggplot(n_reads_rel, aes(x=stage, y=p_reads_lost_abs_marginal,fill=group,group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_grp() +\n scale_y_continuous(\"% Total Reads Lost\", expand=c(0,0), labels = function(x) x*100) +\n theme_kit\ng_reads_rel\n\n\n\n\n\n\n\nData cleaning with FASTP was mostly successful at removing adapters; however, detectable levels of Illumina Universal Adapter sequences persisted through the preprocessing pipeline. FASTP was successful at improving read quality.\n\nCodeg_qual <- ggplot(mapping=aes(color=group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_grp() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters <- g_qual + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,20),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~adapter)\ng_adapters\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base <- g_qual +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_base\n\n\n\n\n\n\nCodeg_quality_seq <- g_qual +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_seq\n\n\n\n\n\n\n\nAccording to FASTQC, deduplication was quite effective at reducing measured duplicate levels, with FASTQC-measured levels falling from an average of 55% to one of 22%:\n\nCodeg_dup_stages <- ggplot(basic_stats, aes(x=stage, y=percent_duplicates, fill=group, group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_grp() +\n scale_y_continuous(\"% Duplicates\", limits=c(0,100), breaks=seq(0,100,20), expand=c(0,0)) +\n theme_kit\ng_readlen_stages <- ggplot(basic_stats, aes(x=stage, y=mean_seq_len, fill=group, group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_grp() +\n scale_y_continuous(\"Mean read length (nt)\", expand=c(0,0)) +\n theme_kit\nlegend_loc <- get_legend(g_dup_stages)\ng_dup_stages\n\n\n\n\n\n\nCodeg_readlen_stages\n\n\n\n\n\n\n\nHigh-level composition\nAs before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:\n\nCode# Import Bracken data\nbracken_path <- file.path(data_dir, \"bracken_counts.tsv\")\nbracken <- read_tsv(bracken_path, show_col_types = FALSE)\ntotal_assigned <- bracken %>% group_by(sample) %>% summarize(\n name = \"Total\",\n kraken_assigned_reads = sum(kraken_assigned_reads),\n added_reads = sum(added_reads),\n new_est_reads = sum(new_est_reads),\n fraction_total_reads = sum(fraction_total_reads)\n)\nbracken_spread <- bracken %>% select(name, sample, new_est_reads) %>%\n mutate(name = tolower(name)) %>%\n pivot_wider(id_cols = \"sample\", names_from = \"name\", values_from = \"new_est_reads\")\n# Count reads\nread_counts_preproc <- basic_stats %>% \n select(sample, group, stage, n_read_pairs) %>%\n pivot_wider(id_cols = c(\"sample\", \"group\"), names_from=\"stage\", values_from=\"n_read_pairs\")\nread_counts <- read_counts_preproc %>%\n inner_join(total_assigned %>% select(sample, new_est_reads), by = \"sample\") %>%\n rename(assigned = new_est_reads) %>%\n inner_join(bracken_spread, by=\"sample\")\n# Assess composition\nread_comp <- transmute(read_counts, sample=sample, group=group,\n n_filtered = raw_concat-cleaned,\n n_duplicate = cleaned-dedup,\n n_ribosomal = (dedup-ribo_initial) + (ribo_initial-ribo_secondary),\n n_unassigned = ribo_secondary-assigned,\n n_bacterial = bacteria,\n n_archaeal = archaea,\n n_viral = viruses,\n n_human = eukaryota)\nread_comp_long <- pivot_longer(read_comp, -(sample:group), names_to = \"classification\",\n names_prefix = \"n_\", values_to = \"n_reads\") %>%\n mutate(classification = fct_inorder(str_to_sentence(classification))) %>%\n group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads))\nread_comp_summ <- read_comp_long %>% \n group_by(group, classification) %>%\n summarize(n_reads = sum(n_reads), .groups = \"drop_last\") %>%\n mutate(n_reads = replace_na(n_reads,0),\n p_reads = n_reads/sum(n_reads),\n pc_reads = p_reads*100)\n \n# Plot overall composition\ng_comp <- ggplot(read_comp_long, aes(x=sample, y=p_reads, fill=classification)) +\n geom_col(position = \"stack\") +\n scale_y_continuous(name = \"% Reads\\n\", limits = c(0,1.01), breaks = seq(0,1,0.2),\n expand = c(0,0), labels = function(x) x*100) +\n scale_fill_brewer(palette = \"Set1\", name = \"Classification\") +\n facet_wrap(.~group, scales=\"free\", ncol=5) +\n theme_kit\ng_comp\n\n\n\n\n\n\nCode# Plot composition of minor components\nread_comp_minor <- read_comp_long %>% filter(classification %in% c(\"Archaeal\", \"Viral\", \"Human\", \"Other\"))\npalette_minor <- brewer.pal(9, \"Set1\")[6:9]\ng_comp_minor <- ggplot(read_comp_minor, aes(x=sample, y=p_reads, fill=classification)) +\n geom_col(position = \"stack\") +\n scale_y_continuous(name = \"% Reads\\n\",\n expand = c(0,0), labels = function(x) x*100) +\n scale_fill_manual(values=palette_minor, name = \"Classification\") +\n facet_wrap(.~group, scales = \"free_x\", ncol=5) +\n theme_kit\ng_comp_minor\n\n\n\n\n\n\n\n\nCodep_reads_summ_group <- read_comp_long %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, group) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification, group) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\")\np_reads_summ_total <- read_comp_long %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, group) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\") %>%\n mutate(group = \"All groups\")\np_reads_summ_prep <- bind_rows(p_reads_summ_total, p_reads_summ_group) %>%\n mutate(group = fct_inorder(group),\n classification = fct_inorder(classification),\n pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n display = paste0(pc_min, \"-\", pc_max, \"% (mean \", pc_mean, \"%)\"))\np_reads_summ <- p_reads_summ_prep %>%\n select(classification, group, display) %>%\n pivot_wider(names_from=group, values_from = display)\np_reads_summ\n\n\n \n\n\n\nAverage composition varied substantially between groups. Four groups (A, B, I, J) showed high within-group consistency, with high levels of ribosomal reads and very low levels of assigned reads. Other groups showed much more variability between samples, with higher average levels of assigned reads.\nOverall, the average relative abundance of viral reads was 0.04%; however, groups F, G & H showed substantially higher average abundance of around 0.1%. Even these elevated groups, however, still show lower total viral abundance than Rothman or Crits-Christoph, so this doesn’t seem particularly noteworthy.\nHuman-infecting virus reads: validation, round 1\nNext, I investigated the human-infecting virus read content of these unenriched samples, using a pipeline identical to that described in my entry on unenriched Rothman samples. This process identified a total of 31,610 read pairs across all samples (0.007% of surviving reads):\n\nCodehv_reads_filtered_1_path <- file.path(data_dir, \"hv_hits_putative_filtered_1.tsv.gz\")\nhv_reads_filtered_1 <- read_tsv(hv_reads_filtered_1_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>% \n arrange(group) %>%\n mutate(sample = fct_inorder(sample))\nn_hv_filtered_1 <- hv_reads_filtered_1 %>% group_by(sample, group) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_1_summ <- n_hv_filtered_1 %>% ungroup %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n\n\nCodemrg_1 <- hv_reads_filtered_1 %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n group_by(sample) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%\n mutate(seq_num = row_number(),\n adj_score_max = pmax(adj_score_fwd, adj_score_rev))\n# Import Bowtie2/Kraken data and perform filtering steps\ng_mrg_1 <- ggplot(mrg_1, aes(x=adj_score_fwd, y=adj_score_rev)) +\n geom_point(alpha=0.5, shape=16) + \n scale_x_continuous(name=\"S(forward read)\", limits=c(0,40), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_y_continuous(name=\"S(reverse read)\", limits=c(0,40), breaks=seq(0,100,10), expand = c(0,0)) +\n facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +\n theme_base + theme(aspect.ratio=1)\ng_mrg_1\n\n\n\n\n\n\nCodeg_hist_1 <- ggplot(mrg_1, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + facet_wrap(~kraken_label) + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_1\n\n\n\n\n\n\n\nTo analyze all these reads in a reasonable amount of time, I set up a dedicated EC2 instance, downloaded nt, and ran BLASTN locally there, otherwise using the same parameters I’ve used for past datasets:\n\nCodemrg_1_num <- mrg_1 %>% group_by(sample) %>%\n arrange(sample, desc(adj_score_max)) %>%\n mutate(seq_num = row_number(), seq_head = paste0(\">\", sample, \"_\", seq_num))\nmrg_1_fasta <- mrg_1_num %>% \n ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\nmrg_1_fasta_out <- do.call(paste, c(mrg_1_fasta, sep=\"\\n\")) %>% paste(collapse=\"\\n\")\nwrite(mrg_1_fasta_out, file.path(data_dir, \"spurbeck-putative-viral-1.fasta\"))\n\n\n\nCodeviral_taxa_path <- file.path(data_dir, \"viral-taxids.tsv.gz\")\nviral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)\n\n# NB: Importing partially-processed BLAST results to save storage space\n# Import BLAST results\nblast_results_1_path <- file.path(data_dir, \"putative-viral-blast-best-1.tsv.gz\")\nblast_results_1 <- read_tsv(blast_results_1_path, show_col_types = FALSE, col_types = cols(.default=\"c\"))\n\n# Filter for best hit for each query/subject\nblast_results_1_best <- blast_results_1 %>% group_by(qseqid, staxid) %>% \n filter(bitscore == max(bitscore)) %>%\n filter(length == max(length)) %>% filter(row_number() == 1)\n\n# Rank hits for each query\nblast_results_1_ranked <- blast_results_1_best %>% \n group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))\nblast_results_1_highrank <- blast_results_1_ranked %>% filter(rank <= 5) %>%\n mutate(read_pair = str_split(qseqid, \"_\") %>% sapply(nth, n=-1), \n seq_num = str_split(qseqid, \"_\") %>% sapply(nth, n=-2), \n sample = str_split(qseqid, \"_\") %>% lapply(head, n=-2) %>% \n sapply(paste, collapse=\"_\")) %>%\n mutate(bitscore = as.numeric(bitscore), seq_num = as.numeric(seq_num))\n\n# Summarize by read pair and taxid\nblast_results_1_paired <- blast_results_1_highrank %>%\n group_by(sample, seq_num, staxid) %>%\n summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),\n best_rank = min(rank),\n n_reads = n(), .groups = \"drop\")\n\n# Add viral status\nblast_results_1_viral <- blast_results_1_paired %>%\n mutate(viral = staxid %in% viral_taxa$taxid,\n viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg_1_assign <- mrg_1_num %>% select(sample, seq_num, taxid, assigned_taxid)\nblast_results_1_assign <- full_join(blast_results_1_viral, mrg_1_assign, by=c(\"sample\", \"seq_num\")) %>%\n mutate(taxid_match_bowtie = (staxid == taxid),\n taxid_match_kraken = (staxid == assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_results_1_out <- blast_results_1_assign %>%\n group_by(sample, seq_num) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\") %>%\n mutate(viral_status = replace_na(viral_status, 0))\n\n\n\nCode# Merge BLAST results with unenriched read data\nmrg_1_blast <- full_join(mrg_1_num, blast_results_1_out, by=c(\"sample\", \"seq_num\")) %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_mrg_blast_1 <- mrg_1_blast %>%\n ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +\n geom_point(alpha=0.5, shape=16) +\n scale_x_continuous(name=\"S(forward read)\", limits=c(0,40), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_y_continuous(name=\"S(reverse read)\", limits=c(0,40), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_color_brewer(palette = \"Set1\", name = \"Viral status\") +\n facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +\n theme_base + theme(aspect.ratio=1)\ng_mrg_blast_1\n\n\n\n\n\n\n\n\nCodeg_hist_blast_1 <- ggplot(mrg_1_blast, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + facet_wrap(~viral_status_out) + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_blast_1\n\n\n\n\n\n\n\nThere results were…okay. There were a lot of false positives, but nearly all of them were low-scoring and excluded by my usual score filters. Most true positives, meanwhile, had high enough scores to be retained by those filters. However, there were enough high-scoring false positives and low-scoring true positives to drag down my precision and sensitivity, resulting in an F1 score (at a disjunctive score threshold of 20) a little under 90% – quite a few percentage points lower than I usually aim for.\n\nCodetest_sens_spec <- function(tab, score_threshold, conjunctive, include_special){\n if (!include_special) tab <- filter(tab, viral_status_out %in% c(\"TRUE\", \"FALSE\"))\n tab_retained <- tab %>% mutate(\n conjunctive = conjunctive,\n retain_score_conjunctive = (adj_score_fwd > score_threshold & adj_score_rev > score_threshold), \n retain_score_disjunctive = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),\n retain_score = ifelse(conjunctive, retain_score_conjunctive, retain_score_disjunctive),\n retain = assigned_hv | hit_hv | retain_score) %>%\n group_by(viral_status_out, retain) %>% count\n pos_tru <- tab_retained %>% filter(viral_status_out == \"TRUE\", retain) %>% pull(n) %>% sum\n pos_fls <- tab_retained %>% filter(viral_status_out != \"TRUE\", retain) %>% pull(n) %>% sum\n neg_tru <- tab_retained %>% filter(viral_status_out != \"TRUE\", !retain) %>% pull(n) %>% sum\n neg_fls <- tab_retained %>% filter(viral_status_out == \"TRUE\", !retain) %>% pull(n) %>% sum\n sensitivity <- pos_tru / (pos_tru + neg_fls)\n specificity <- neg_tru / (neg_tru + pos_fls)\n precision <- pos_tru / (pos_tru + pos_fls)\n f1 <- 2 * precision * sensitivity / (precision + sensitivity)\n out <- tibble(threshold=score_threshold, include_special = include_special, \n conjunctive = conjunctive, sensitivity=sensitivity, \n specificity=specificity, precision=precision, f1=f1)\n return(out)\n}\nrange_f1 <- function(intab, inc_special=FALSE, inrange=15:45){\n tss <- purrr::partial(test_sens_spec, tab=intab, include_special=inc_special)\n stats_conj <- lapply(inrange, tss, conjunctive=TRUE) %>% bind_rows\n stats_disj <- lapply(inrange, tss, conjunctive=FALSE) %>% bind_rows\n stats_all <- bind_rows(stats_conj, stats_disj) %>%\n pivot_longer(!(threshold:conjunctive), names_to=\"metric\", values_to=\"value\") %>%\n mutate(conj_label = ifelse(conjunctive, \"Conjunctive\", \"Disjunctive\"))\n return(stats_all)\n}\nstats_1 <- range_f1(mrg_1_blast)\nthreshold_opt_1 <- stats_1 %>% group_by(conj_label) %>% \n filter(metric == \"f1\") %>%\n filter(value == max(value)) %>% filter(threshold == min(threshold))\ng_stats_1 <- ggplot(stats_1, aes(x=threshold, y=value, color=metric)) +\n geom_vline(data = threshold_opt_1, mapping = aes(xintercept=threshold),\n color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Set3\") +\n facet_wrap(~conj_label) +\n theme_base\ng_stats_1\n\n\n\n\n\n\n\nDigging into taxonomic assignments more deeply, we find that low-scoring false positives are primarily mapped by Bowtie2 to SARS-CoV-2, while higher-scoring false positives are mainly mapped to a variety of parvoviruses and parvo-like viruses, as well as Orf virus (cows again?). High-scoring true positives map to a wide range of viruses, but low-scoring true-positives primarily map to human gammaherpesvirus 4. Given that the latter is a DNA virus, I find this quite suspicious.\n\nCodefp_1 <- mrg_1_blast %>% \n group_by(viral_status_out, highscore = adj_score_max >= 20, taxid) %>% count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p)) %>%\n mutate(name = ifelse(taxid == 194958, \"Porcine endogenous retrovirus A\", name))\nfp_1_major_tab <- fp_1 %>% filter(p > 0.05) %>% arrange(desc(p))\nfp_1_major_list <- fp_1_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp_1_major <- fp_1 %>% mutate(major = p > 0.1) %>% \n mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% \n summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp_1_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\ng_fp_1 <- ggplot(fp_1_major, aes(x=score_display, y=p, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), \n breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_brewer(palette = \"Set3\", name = \"Viral\\ntaxon\") +\n facet_wrap(~status_display) +\n guides(fill=guide_legend(nrow=6)) +\n theme_kit\ng_fp_1\n\n\n\n\n\n\n\nSince sensitivity is more of a problem here than precision, I decided to dig into into the “true positive” herpesvirus reads. There are 1,197 of these in total, 93% of which are low scoring. When I look into the top taxids that BLAST maps these reads to, we see the following:\n\nCode# Get taxon names\ntax_names_path <- file.path(data_dir, \"taxid-names.tsv.gz\")\ntax_names <- read_tsv(tax_names_path, show_col_types = FALSE)\n# Get BLAST staxids\nherp_seqs_1 <- mrg_1_blast %>% filter(taxid == 10376, viral_status_out) %>%\n select(sample, seq_num, taxid, adj_score_max) %>%\n mutate(highscore = adj_score_max >= 20)\nherp_seqs_1_total <- herp_seqs_1 %>% group_by(highscore) %>% \n count(name=\"n_seqs_total\")\nherp_blast_1_hits <- herp_seqs_1 %>% \n left_join(blast_results_1_paired, by=c(\"sample\", \"seq_num\")) %>%\n mutate(staxid = as.integer(staxid))\nherp_blast_1_hits_top <- herp_blast_1_hits %>% group_by(staxid) %>% count %>% \n arrange(desc(n)) %>% left_join(tax_names, by=\"staxid\") %>%\n mutate(name=ifelse(staxid == 3004166, \n \"Caudopunctatus cichlid hepacivirus\", name)) %>%\n mutate(name = fct_inorder(name))\n\n# Plot\ng_herp_1 <- ggplot(herp_blast_1_hits_top %>% head(10), \n aes(x=n, y=fct_inorder(name))) + geom_col() +\n scale_x_continuous(name=\"# Mapped Read Pairs\") + theme_base + \n theme(axis.title.y = element_blank())\ng_herp_1\n\n\n\n\n\n\n\nTwo things strike me as notable about these results. First, human gammaherpesvirus 4 is only the second-most-common subject taxid, after SARS-CoV-2, an unrelated virus with a different nucleic-acid type. Hmm. Second, close behind these two viruses is a distinctly non-viral taxon, the common carp. This jumped out at me, because the common carp genome is somewhat notorious for being contaminated with Illumina adapter sequences. This makes me suspect that Illumina adapter contamination is playing a role in these results.\n\nCodecarp_hits_1 <- herp_blast_1_hits %>% filter(staxid == 7962) %>% \n select(sample, seq_num, taxid) %>% \n left_join(mrg_1_blast, by=c(\"sample\", \"seq_num\", \"taxid\")) %>% \n select(sample, seq_num, taxid, adj_score_max, query_seq_fwd, query_seq_rev)\ncarp_hits_1_out <- carp_hits_1 %>%\n mutate(seq_head = paste0(\">\", sample, \"_\", seq_num))\ncarp_hits_1_fasta <- carp_hits_1_out %>% ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, \n header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\ncarp_hits_1_fasta_out <- do.call(paste, c(carp_hits_1_fasta, sep=\"\\n\")) %>% \n paste(collapse=\"\\n\")\nwrite(carp_hits_1_fasta_out, file.path(data_dir, \"spurbeck-carp-hits-1.fasta\"))\n\n\nInspecting these reads manually, I find that a large fraction do indeed show substantial adapter content. In particular, of the 1658 individual reads queried, at least 1338 (81%) showed a strong match to the Illumina multiplexing primer. As such, better removal of these adapter sequences would likely remove most or all of these “false true positives”, potentially significantly improving my results for this dataset.\nHuman-infecting virus reads: validation, round 2\nTo address this problem, I added Cutadapt to the pipeline, using settings that allowed it to trim known adaptors internal to the read:\ncutadapt -b file:<adapter_file> -B file:<adapter_file> -m 15 -e 0.25 --action=trim -o <fwd_reads_out> -p <rev_reads_out> <fwd_reads_in> <rev_reads_in>\nRunning this additional preprocessing step reduced the total number of reads surviving cleaning by 10.4M, or an average of 189k reads per sample. The number of putative human-viral reads, meanwhile, was reduced from 31,610 to 19,799, a 37% decrease. This reduction is primarily due to a large decrease in the number of low-scoring Bowtie2-only putative HV hits.\n\nCodebasic_stats_2_path <- file.path(data_dir, \"qc_basic_stats_2.tsv\")\nbasic_stats_2 <- read_tsv(basic_stats_2_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>% \n mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\nbasic_stats_2_sum <- basic_stats_2 %>% group_by(stage) %>% summarize(n_read_pairs = sum(n_read_pairs), n_bases_approx = sum(n_bases_approx)) %>% mutate(label = \"With Cutadapt\")\nbasic_stats_sum <- basic_stats %>% group_by(stage) %>% summarize(n_read_pairs = sum(n_read_pairs), n_bases_approx = sum(n_bases_approx)) %>% mutate(label = \"Without Cutadapt\")\nsum_cat <- bind_rows(basic_stats_sum, basic_stats_2_sum) %>% mutate(label=fct_inorder(label))\ng_sum_cat_reads <- ggplot(sum_cat, aes(x=stage, y=n_read_pairs, fill=label)) +\n geom_col(position=\"dodge\") + \n scale_fill_brewer(palette = \"Set1\", name=\"Preprocessing\") + \n scale_y_continuous(name=\"# Read Pairs\", expand=c(0,0)) +\n theme_kit\ng_sum_cat_reads\n\n\n\n\n\n\n\n\nCodehv_reads_filtered_2_path <- file.path(data_dir, \"hv_hits_putative_filtered_2.tsv.gz\")\nhv_reads_filtered_2 <- read_tsv(hv_reads_filtered_2_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>% \n arrange(group) %>%\n mutate(sample = fct_inorder(sample))\nn_hv_filtered_2 <- hv_reads_filtered_2 %>% group_by(sample, group) %>% count %>% inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% select(sample, n_read_pairs), by=\"sample\") %>% rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_2_summ <- n_hv_filtered_2 %>% ungroup %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n\n\nCode# Make new labelled HV dataset\nmrg_2 <- hv_reads_filtered_2 %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n group_by(sample) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%\n mutate(seq_num = row_number(),\n adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE))\n\n# Merge and plot histogram\nmrg_2_join <- bind_rows(mrg_1 %>% mutate(label=\"Without Cutadapt\"),\n mrg_2 %>% mutate(label=\"With Cutadapt\")) %>%\n mutate(label = fct_inorder(label))\n\n\ng_hist_2 <- ggplot(mrg_2_join, aes(x=adj_score_max)) + \n geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + \n facet_grid(label~kraken_label) + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_2\n\n\n\n\n\n\n\nComparing the lengths of the same sequences in the old versus new results, we see that many reads are reduced in length as a result of adding Cutadapt. As such, it made sense to repeat the BLAST analysis on the reprocessed reads rather than using the BLAST assignments from the previous analysis.\n\nCodemrg_num_comp <- inner_join(mrg_2 %>% ungroup %>% select(-seq_num), \n mrg_1_num %>% ungroup %>% select(seq_id, seq_num), by=c(\"seq_id\"))\nmrg_num_diff <- mrg_1_num %>% select(sample, seq_num, query_len_fwd_old = query_len_fwd,\n query_len_rev_old = query_len_rev) %>% \n inner_join(mrg_num_comp %>% select(sample, seq_num, query_len_fwd_new = query_len_fwd, \n query_len_rev_new = query_len_rev),\n by=c(\"sample\", \"seq_num\")) %>%\n mutate(query_len_fwd_diff = query_len_fwd_new - query_len_fwd_old,\n query_len_rev_diff = query_len_rev_new - query_len_rev_old)\nmrg_num_diff_long <- mrg_num_diff %>% \n select(sample, seq_num, Forward=query_len_fwd_diff, Reverse=query_len_rev_diff) %>%\n pivot_longer(-(sample:seq_num), names_to=\"read\", values_to=\"length_difference\")\ng_len_diff <- ggplot(mrg_num_diff_long, aes(x=length_difference)) +\n geom_histogram(binwidth=4, boundary=2) +\n scale_x_continuous(name=\"Effect of Cutadapt on read length\") +\n scale_y_continuous(name=\"# of Reads\", expand=c(0,0)) +\n facet_wrap(~read) +\n theme_base\ng_len_diff\n\n\n\n\n\n\n\n\nCodemrg_2_num <- mrg_2 %>% group_by(sample) %>%\n arrange(sample, desc(adj_score_max)) %>%\n mutate(seq_num = row_number(), seq_head = paste0(\">\", sample, \"_\", seq_num))\nmrg_2_fasta <- mrg_2_num %>% \n ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\nmrg_2_fasta_out <- do.call(paste, c(mrg_1_fasta, sep=\"\\n\")) %>% paste(collapse=\"\\n\")\nwrite(mrg_2_fasta_out, file.path(data_dir, \"spurbeck-putative-viral-2.fasta\"))\n\n\n\nCode# Import BLAST results (again, pre-filtered to save space)\nblast_results_2_path <- file.path(data_dir, \"putative-viral-blast-best-2.tsv.gz\")\nblast_results_2 <- read_tsv(blast_results_2_path, show_col_types = FALSE, col_types = cols(.default=\"c\"))\n\n# Filter for best hit for each query/subject\nblast_results_2_best <- blast_results_2 %>% group_by(qseqid, staxid) %>% \n filter(bitscore == max(bitscore)) %>%\n filter(length == max(length)) %>% filter(row_number() == 1)\n\n# Rank hits for each query\nblast_results_2_ranked <- blast_results_2_best %>% \n group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))\nblast_results_2_highrank <- blast_results_2_ranked %>% filter(rank <= 5) %>%\n mutate(read_pair = str_split(qseqid, \"_\") %>% sapply(nth, n=-1), \n seq_num = str_split(qseqid, \"_\") %>% sapply(nth, n=-2), \n sample = str_split(qseqid, \"_\") %>% lapply(head, n=-2) %>% \n sapply(paste, collapse=\"_\")) %>%\n mutate(bitscore = as.numeric(bitscore), seq_num = as.numeric(seq_num))\n\n# Summarize by read pair and taxid\nblast_results_2_paired <- blast_results_2_highrank %>%\n group_by(sample, seq_num, staxid) %>%\n summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),\n best_rank = min(rank),\n n_reads = n(), .groups = \"drop\")\n\n# Add viral status\nblast_results_2_viral <- blast_results_2_paired %>%\n mutate(viral = staxid %in% viral_taxa$taxid,\n viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg_2_assign <- mrg_2_num %>% select(sample, seq_num, taxid, assigned_taxid)\nblast_results_2_assign <- full_join(blast_results_2_viral, mrg_2_assign, \n by=c(\"sample\", \"seq_num\")) %>%\n mutate(taxid_match_bowtie = (staxid == taxid),\n taxid_match_kraken = (staxid == assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_results_2_out <- blast_results_2_assign %>%\n group_by(sample, seq_num) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\") %>%\n mutate(viral_status = replace_na(viral_status, 0))\n\n\nThe addition of Cutadapt results in a substantial decrease in low-scoring putative HV sequences, resulting in a large improvement in measured sensitivity and F1 score:\n\nCode# Merge BLAST results with unenriched read data\nmrg_2_blast <- full_join(mrg_2_num, blast_results_2_out, by=c(\"sample\", \"seq_num\")) %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Combine and label\nhist_blast_2_prep <- bind_rows(mrg_1_blast %>% mutate(label=\"Without Cutadapt\"),\n mrg_2_blast %>% mutate(label=\"With Cutadapt\")) %>%\n mutate(label = fct_inorder(label))\n\n# Plot\ng_hist_blast_2 <- ggplot(hist_blast_2_prep, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + facet_grid(label~viral_status_out) + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_blast_2\n\n\n\n\n\n\n\n\nCodestats_2 <- range_f1(mrg_2_blast) %>% mutate(label = \"With Cutadapt\")\nstats_comb <- bind_rows(stats_1 %>% mutate(label = \"Without Cutadapt\"), stats_2)\n\ng_stats_2 <- ggplot(stats_comb, aes(x=threshold, y=value, color=label, linetype=conj_label)) +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(NA,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\", name = \"Configuration\") +\n scale_linetype_discrete(name=\"Threshold type\") +\n facet_wrap(~metric, nrow=2, scales = \"free_y\") +\n theme_base\ng_stats_2\n\n\n\n\n\n\n\nAt a disjunctive threshold of 20, excluding “fake true positives” arising from adapter contamination improved measured sensitivity from 86% to 97%, bringing the measured F1 score up from 89% to 95%. I would feel much better about using these results for further downstream analyses.\nThat said, I think further improvement is possible. While addition of Cutadapt processing substantially improved measured sensitivity, measured precision only improved slightly. Looking at the apparent taxonomic makeup of putative HV reads, we see that, while low-scoring “true positives” mapping to herpesviruses have been mostly eliminated, high-scoring false positives still map primarily to parvoviruses and parvo-like viruses:\n\nCode# Calculate composition for 2nd attempt\nmajor_threshold <- 0.1\nfp_2 <- mrg_2_blast %>% group_by(viral_status_out, highscore = adj_score_max >= 20, taxid) %>% \n count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p)) %>%\n mutate(name = ifelse(taxid == 194958, \"Porcine endogenous retrovirus A\", name),\n major = p > major_threshold)\nfp_2_major_tab <- fp_2 %>% filter(major) %>% arrange(desc(p))\nfp_2_major_list <- fp_2_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp_2_major <- fp_2 %>% mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp_2_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\n\n# Combine composition\nfp_major_comb <- bind_rows(fp_1_major %>% mutate(label = \"Without Cutadapt\"),\n fp_2_major %>% mutate(label = \"With Cutadapt\")) %>%\n mutate(label = fct_inorder(label)) %>%\n arrange(as.character(name_display)) %>% arrange(str_detect(name_display, \"Other\")) %>%\n mutate(name_display = fct_inorder(name_display))\n\n# Palette\npalette_fp_2 <- c(brewer.pal(12, \"Set3\"), \"#999999\")\ng_fp_2 <- ggplot(fp_major_comb, aes(x=score_display, y=p, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), breaks = seq(0,1,0.2), expand = c(0,0), label = function(y) y*100) +\n scale_fill_manual(name = \"Viral\\ntaxon\", values=palette_fp_2) +\n facet_grid(label~status_display) +\n guides(fill=guide_legend(nrow=6)) +\n theme_kit\ng_fp_2\n\n\n\n\n\n\n\nNot only are these parvoviruses and parvo-like viruses DNA viruses (and thus suspicious as abundant components of the RNA virome), they are also the subject of a famous controversy in early viral metagenomics, where viruses apparently widespread in Chinese hepatitis patients were found to arise from spin column contamination. In fact, these viruses seem not to be human-infecting at all; the authors of the study reporting the contamination suggested that they might be viruses of the diatom algae used as the source of silica for these columns. As such, it seems appropriate to remove these from the human-virus database used to identify putative HV reads.\nWe also see a significant number of false-positive reads mapped to Orf virus. Historically this has been a sign of contamination with bovine sequences, and indeed the top taxa mapped to these reads by BLAST include Bos taurus (cattle), Bos mutus (wild yak), and Cervus elaphus (red deer).\n\nCodeorf_taxid <- 10258\norf_seqs <- mrg_1_blast %>% filter(taxid == 10258)\norf_matches <- orf_seqs %>% inner_join(blast_results_1_paired, by=c(\"sample\", \"seq_num\"))\norf_staxids <- orf_matches %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>% mutate(staxid = as.integer(staxid)) %>% left_join(tax_names, by=\"staxid\")\norf_fp_out <- orf_seqs %>% filter(!viral_status_out) %>% arrange(desc(adj_score_max)) %>%\n ungroup %>%\n mutate(seq_head = paste0(\">\", taxid, \"_\", row_number()))\norf_fp_fasta <- orf_fp_out %>% ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\norf_fp_fasta_out <- do.call(paste, c(orf_fp_fasta, sep=\"\\n\")) %>% paste(collapse=\"\\n\")\nwrite(orf_fp_fasta_out, file.path(data_dir, \"spurbeck-orf-fp.fasta\"))\n\n\nThe ideal approach to removing these false positives would be to add the cow genome to the Kraken database we use for validation of Bowtie2 assignments, along with the pig genome and probably some other known contaminants causing issues. This is probably worth doing at some point, but represents a substantial amount of additional work; I’d rather use a simpler alternative right now if one is available. One option that comes to mind is to try replacing the BBMap aligner I’m using to detect and remove contaminants with Bowtie2; in quick experiments, running Bowtie2 (--sensitive) on the putative Orf virus sequences above, using the same dataset of contaminants for the index, successfully removed about two-thirds of them. While this doesn’t guarantee that Bowtie2 would perform as well on the whole population of contaminating cow sequences, it suggests that using Bowtie2 in place of BBMap is worth trying.\nAs such, I re-ran the HV detection pipeline again, having made two changes: first, removing Parvovirus NIH-CQV and the parvo-like viruses from the HV database used to identify putative HV reads, and secondly, replacing BBMap with Bowtie2 for contaminant removal.\nHuman-infecting virus reads: validation, round 3\nI tried re-running the HV detection pipeline with three different sets of Bowtie2 settings: (a) --sensitive, (b) --local --sensitive-local, and (c) --local --very-sensitive-local. In total, these find 26,370, 15,338 and 13,160 putative human-viral reads, respectively, compared to 31,610 for my initial attempt and 19,799 after the addition of Cutadapt:\n\nCodehv_reads_filtered_3_paths <- paste0(data_dir, \"hv_hits_putative_filtered_3\", \n c(\"a\", \"b\", \"c\"), \".tsv.gz\")\nhv_reads_filtered_3_raw <- lapply(hv_reads_filtered_3_paths, read_tsv, show_col_types = FALSE)\nhv_reads_filtered_3 <- lapply(1:3, function(n) hv_reads_filtered_3_raw[[n]] %>% \n mutate(attempt=paste0(\"Bowtie2\", c(\"(a)\",\"(b)\",\"(c)\")[n]))) %>% \n bind_rows() %>%\n inner_join(libraries, by=\"sample\") %>% \n arrange(group) %>%\n mutate(sample = fct_inorder(sample))\nn_hv_filtered_3 <- hv_reads_filtered_3 %>% group_by(sample, group, attempt) %>% count %>% inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% select(sample, n_read_pairs), by=\"sample\") %>% rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_3_summ <- n_hv_filtered_3 %>% group_by(attempt) %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\nbind_rows(n_hv_filtered_2_summ %>% mutate(attempt=\"BBMap\"), n_hv_filtered_3_summ) %>% \n select(attempt, everything())\n\n\n \n\n\n\n\nCode# Make new labelled HV dataset\nmrg_3 <- hv_reads_filtered_3 %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n group_by(sample) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%\n mutate(seq_num = row_number(),\n adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE))\n\n# Merge and plot histogram\nmrg_join_3 <- bind_rows(mrg_2 %>% mutate(attempt=\"BBMap\"), mrg_3)\n\n\ng_hist_3 <- ggplot(mrg_join_3, aes(x=adj_score_max)) + \n geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + \n facet_grid(attempt~kraken_label) + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_3\n\n\n\n\n\n\n\nIn all cases, the great majority of false-positive sequences from previous attempts were successfully removed (85%/89%/90%, respectively), along with a small number of true-positives (0.19%/0.23%/0.23%, respectively). At the same time, a number of new putative HV sequences arose as a result of the change in filtering algorithm: 8,512/2,224/1,166 respectively. The great majority of these (97.6%/99.1%/98.8%) are low-scoring, suggesting that they are mostly or entirely false matches that are being let through by Bowtie2 that were previously being caught by BBMap.\n\nCodemrg_2_blast_prep <- mrg_2_blast %>% ungroup %>% \n select(seq_id, seq_num, viral_status, viral_status_out)\nmrg_3_blast_old <- mrg_join_3 %>% select(-seq_num) %>% ungroup %>% \n left_join(mrg_2_blast_prep, by = \"seq_id\") %>%\n rename(viral_status_old = viral_status) %>%\n mutate(viral_status_old_out = replace_na(as.character(viral_status_out), \"UNKNOWN\")) %>%\n select(-viral_status_out)\nmrg_3_blast_old_cum <- mrg_3_blast_old %>%\n mutate(vs_label = paste0(\"HV status:\\n\", viral_status_old_out),\n attempt_label = paste(\"Attempt\", attempt))\nmrg_3_blast_old_summ <- mrg_3_blast_old_cum %>% \n group_by(attempt, viral_status_old_out, highscore = adj_score_max >= 20) %>% count %>%\n mutate(score_label = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n vs_label = paste(\"HV status:\", viral_status_old_out))\ng_status_summ <- ggplot(mrg_3_blast_old_summ, aes(x=attempt, y=n, fill=attempt)) +\n geom_col(position=\"dodge\") +\n facet_wrap(score_label~vs_label, scales = \"free_y\") +\n scale_y_continuous(name=\"# Read pairs\") +\n scale_fill_brewer(palette = \"Dark2\") +\n guides(fill=FALSE) +\n theme_kit\n\nWarning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use \"none\" instead as\nof ggplot2 3.3.4.\n\nCodeg_status_summ\n\n\n\n\n\n\n\nChecking putative Orf virus reads specifically, all three Bowtie2-based attempts successfully remove most putative Orf reads from the previous attempt, while introducing some number of new putative hits (of which I assume the vast majority are fake hits arising from bovine contamination). In attempt (a), these new putative hits outnumber the old hits that were removed, resulting in a net increase in putative (but, I’m fairly confident, false) Orf-virus hits, whether total or high-scoring. Conversely, attempts (b) and (c) manage to remove even more old putative Orf hits while creating many fewer new ones, resulting in a large net decrease in total and high-scoring putative Orf hits:\n\nCodeorf_seqs_3 <- mrg_3_blast_old_cum %>% filter(taxid == 10258)\norf_seqs_3_summ <- orf_seqs_3 %>% \n group_by(attempt, viral_status_old_out, highscore = adj_score_max >= 20) %>%\n count\ng_orf_total <- orf_seqs_3_summ %>% group_by(attempt) %>% summarize(n=sum(n)) %>%\n ggplot(aes(x=attempt, y=n, fill=attempt)) + geom_col() +\n scale_y_continuous(name=\"Total reads mapped to Orf virus\", limits=c(0,250), expand=c(0,0)) +\n scale_fill_brewer(palette = \"Dark2\") +\n guides(fill=FALSE) +\n theme_kit\ng_orf_high <- orf_seqs_3_summ %>% filter(highscore) %>%\n ggplot(aes(x=attempt, y=n, fill=attempt)) + geom_col() +\n scale_fill_brewer(palette = \"Dark2\") +\n guides(fill=FALSE) +\n scale_y_continuous(name=\"High-scoring reads mapped to Orf virus\",\n limits=c(0,250), expand=c(0,0)) +\n theme_kit\ng_orf_total + g_orf_high\n\n\n\n\n\n\n\nNext, I validated all new putative HV sequences with BLASTN as before, then evaluated each attempt’s performance against all the putative HV reads identified by any attempt:\n\nCodemrg_3_fasta_prep <- mrg_join_3 %>%\n filter(! seq_id %in% mrg_2_blast$seq_id) %>%\n group_by(seq_id) %>% filter(row_number() == 1) %>% ungroup\nmrg_3_fasta <- mrg_3_fasta_prep %>%\n mutate(seq_head = paste0(\">\", seq_id)) %>% ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\nmrg_3_fasta_out <- do.call(paste, c(mrg_2_fasta, sep=\"\\n\")) %>% paste(collapse=\"\\n\")\nwrite(mrg_3_fasta_out, file.path(data_dir, \"spurbeck-putative-viral-3.fasta\"))\n\n\n\nCode# Import BLAST results (again, pre-filtered to save space)\n# blast_cols <- c(\"qseqid\", \"sseqid\", \"sgi\", \"staxid\", \"qlen\", \"evalue\", \"bitscore\", \"qcovs\", \"length\", \"pident\", \"mismatch\", \"gapopen\", \"sstrand\", \"qstart\", \"qend\", \"sstart\", \"send\")\n# blast_results_3_path_0 <- file.path(data_dir, \"spurbeck-putative-viral-3.blast.gz\")\n# blast_results_3 <- read_tsv(blast_results_3_path_0, show_col_types = FALSE,\n# col_names = blast_cols, col_types = cols(.default=\"c\"))\n\nblast_results_3_path <- file.path(data_dir, \"putative-viral-blast-best-3.tsv.gz\")\nblast_results_3 <- read_tsv(blast_results_3_path, show_col_types = FALSE, col_types = cols(.default=\"c\"))\n\n# Filter for best hit for each query/subject\nblast_results_3_best <- blast_results_3 %>% group_by(qseqid, staxid) %>%\n filter(bitscore == max(bitscore)) %>%\n filter(length == max(length)) %>% filter(row_number() == 1)\n\n# Rank hits for each query\nblast_results_3_ranked <- blast_results_3_best %>% \n group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))\nblast_results_3_highrank <- blast_results_3_ranked %>% filter(rank <= 5) %>%\n mutate(read_pair = str_split(qseqid, \"_\") %>% sapply(nth, n=2), \n seq_id = str_split(qseqid, \"_\") %>% sapply(nth, n=1)) %>%\n mutate(bitscore = as.numeric(bitscore))\n\n# Summarize by read pair and taxid\nblast_results_3_paired <- blast_results_3_highrank %>%\n group_by(seq_id, staxid) %>%\n summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),\n best_rank = min(rank),\n n_reads = n(), .groups = \"drop\")\n\n# Add viral status\nblast_results_3_viral <- blast_results_3_paired %>%\n mutate(viral = staxid %in% viral_taxa$taxid,\n viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg_3_assign <- mrg_3_fasta_prep %>% select(seq_id, taxid, assigned_taxid)\nblast_results_3_assign <- full_join(blast_results_3_viral, mrg_3_assign, by=c(\"seq_id\")) %>%\n mutate(taxid_match_bowtie = (staxid == taxid),\n taxid_match_kraken = (staxid == assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_results_3_out <- blast_results_3_assign %>%\n group_by(seq_id) %>%\n summarize(viral_status_new = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\") %>%\n mutate(viral_status_new = replace_na(viral_status_new, 0))\n\n\n\nCodemrg_3_blast_new <- mrg_3_blast_old %>%\n left_join(blast_results_3_out, by=\"seq_id\")\nmrg_3_blast <- mrg_3_blast_new %>%\n mutate(viral_status = pmax(viral_status_old, viral_status_new, na.rm = TRUE),\n viral_status = replace_na(viral_status, 0),\n viral_status_out = viral_status > 0)\nmrg_3_blast_fill_fwd <- mrg_3_blast %>% select(attempt, seq_id, adj_score_fwd) %>%\n pivot_wider(names_from=\"attempt\", values_from=\"adj_score_fwd\", values_fill=0) %>%\n pivot_longer(-seq_id, names_to = \"attempt\", values_to = \"adj_score_fwd\")\nmrg_3_blast_fill_rev <- mrg_3_blast %>% select(attempt, seq_id, adj_score_rev) %>%\n pivot_wider(names_from=\"attempt\", values_from=\"adj_score_rev\", values_fill=0) %>%\n pivot_longer(-seq_id, names_to = \"attempt\", values_to = \"adj_score_rev\")\nmrg_3_blast_fill_ass <- mrg_3_blast %>% select(attempt, seq_id, assigned_hv) %>%\n pivot_wider(names_from=\"attempt\", values_from=\"assigned_hv\", values_fill=FALSE) %>%\n pivot_longer(-seq_id, names_to = \"attempt\", values_to = \"assigned_hv\")\nmrg_3_blast_fill_hit <- mrg_3_blast %>% select(attempt, seq_id, hit_hv) %>%\n pivot_wider(names_from=\"attempt\", values_from=\"hit_hv\", values_fill=FALSE) %>%\n pivot_longer(-seq_id, names_to = \"attempt\", values_to = \"hit_hv\")\nmrg_3_blast_fill_ref <- mrg_3_blast %>% group_by(seq_id, sample, viral_status_out, taxid) %>% summarize(.groups = \"drop\") %>% group_by(seq_id, sample, viral_status_out) %>% summarize(taxid = taxid[1], .groups = \"drop\")\nmrg_3_blast_fill <- full_join(mrg_3_blast_fill_fwd, mrg_3_blast_fill_rev, \n by=c(\"attempt\", \"seq_id\")) %>%\n left_join(mrg_3_blast_fill_ass, by=c(\"attempt\", \"seq_id\")) %>%\n left_join(mrg_3_blast_fill_hit, by=c(\"attempt\", \"seq_id\")) %>%\n left_join(mrg_3_blast_fill_ref, by=\"seq_id\") %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev))\n\n\n\nCodeattempts <- mrg_3_blast_fill %>% group_by(attempt) %>% summarize %>% pull(attempt)\nstats_3_raw <- mrg_3_blast_fill %>% group_by(attempt) %>% group_split %>%\n lapply(range_f1)\nstats_3 <- lapply(1:4, function(n) stats_3_raw[[n]] %>% mutate(attempt=attempts[n])) %>%\n bind_rows\ng_stats_3 <- ggplot(stats_3, aes(x=threshold, y=value, color=attempt, linetype=conj_label)) +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(NA,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\", name=\"Configuration\") +\n scale_linetype_discrete(name=\"Threshold type\") +\n facet_wrap(~metric, nrow=2, scales = \"free_y\") +\n guides(color=guide_legend(nrow=2), linetype=guide_legend(nrow=2)) +\n theme_base\ng_stats_3\n\n\n\n\n\n\n\nAt a disjunctive threshold of 20, Bowtie2(c) (--very-sensitive-local) performs the best, with an F1 score of 0.979; Bowtie2(b) (--sensitive-local) is very close behind. In both cases, precision (>=0.988) is higher than sensitivity (~0.969), suggesting that any further improvements would come from investigating low-scoring true-positives:\n\nCodemajor_threshold <- 0.1\nfp_3 <- mrg_3_blast %>% group_by(attempt, viral_status_out, \n highscore = adj_score_max >= 20, taxid) %>% \n count %>% \n group_by(attempt, viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p)) %>%\n mutate(name = ifelse(taxid == 194958, \"Porcine endogenous retrovirus A\", name),\n major = p > major_threshold)\nfp_3_major_tab <- fp_3 %>% filter(major) %>% arrange(desc(p))\nfp_3_major_list <- fp_3_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp_3_major <- fp_3 %>% mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(attempt, viral_status_out, highscore, name_display) %>% summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp_3_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = paste(\"VS:\", viral_status_out)) #ifelse(viral_status_out, \"True positive\", \"False positive\"))\npalette_fp_3 <- c(brewer.pal(12, \"Set3\"), brewer.pal(3, \"Dark2\"), \"#999999\")\ng_fp_3 <- ggplot(fp_3_major, aes(x=score_display, y=p, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_manual(values=palette_fp_3, name = \"Viral\\ntaxon\") +\n facet_grid(attempt~status_display) +\n guides(fill=guide_legend(nrow=8)) +\n theme_kit\ng_fp_3\n\n\n\n\n\n\n\nI suspect that small additional gains are possible here, since I’m suspicious about the low-scoring “true-positives” mapping to human betaherpesvirus 5 and human mastadenovirus F. However, I’ve already spent a long time optimizing the results for this dataset, and the results are now good enough that I feel okay with leaving this here. Going forward, I’ll use Bowtie2(c) as my alignment strategy for the HV identification pipeline.\nHuman-infecting virus reads: analysis\nAfter several rounds of validation and refinement, we finally come to actually analyzing the human-infecting virus content of Spurbeck et al. This section might seem disappointingly short compared to all the effort expended to get here.\n\nCode# Get raw read counts\nread_counts_raw <- basic_stats_raw %>%\n select(sample, group, date, n_reads_raw = n_read_pairs)\n\n# Get HV read counts & RA\nmrg_hv <- mrg_3 %>% filter(attempt == \"Bowtie2(c)\") %>%\n mutate(hv_status = assigned_hv | hit_hv | adj_score_max >= 20)\nread_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% count(name = \"n_reads_hv\")\nread_counts <- read_counts_raw %>%\n left_join(read_counts_hv, by=\"sample\") %>%\n mutate(n_reads_hv = replace_na(n_reads_hv, 0),\n p_reads_hv = n_reads_hv/n_reads_raw)\n\n# Aggregate\nread_counts_group <- read_counts %>% group_by(group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv)) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw)\nread_counts_total <- read_counts_group %>% ungroup %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv)) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw,\n group = \"All groups\")\nread_counts_agg <- read_counts_group %>%\n bind_rows(read_counts_total) %>%\n arrange(group) %>% arrange(str_detect(group, \"All groups\")) %>%\n mutate(group = fct_inorder(group))\n\n\nApplying a disjunctive cutoff at S=20 identifies 9,990 reads as human-viral out of 1.84B total reads, for a relative HV abundance of \\(5.44 \\times 10^{-6}\\). This compares to \\(2.8 \\times 10^{-4}\\) on the public dashboard, corresponding to the results for Kraken-only identification: a roughly 2x increase, smaller than the 4-5x increases seen for Crits-Christoph and Rothman. Relative HV abundances for individual sample groups ranged from \\(9.19 \\times 10^{-8}\\) to \\(1.58 \\times 10^{-5}\\); as with total virus reads, groups F, G & H showed the highest relative abundance:\n\nCode# Visualize\ng_phv_agg <- ggplot(read_counts_agg, aes(x=group, y=p_reads_hv, color=group)) +\n geom_point() +\n scale_y_log10(\"Relative abundance of human virus reads\") +\n scale_color_brewer(palette = \"Set3\", name = \"Group\") +\n theme_kit\ng_phv_agg\n\n\n\n\n\n\n\nDigging into particular viruses, we see that Mamastrovirus, Mastadenovirus, Rotavirus and Betacoronavirus are the most abundant genera across samples:\n\nCode# Get viral taxon names for putative HV reads\nviral_taxa$name[viral_taxa$taxid == 249588] <- \"Mamastrovirus\"\nviral_taxa$name[viral_taxa$taxid == 194960] <- \"Kobuvirus\"\nviral_taxa$name[viral_taxa$taxid == 688449] <- \"Salivirus\"\nviral_taxa$name[viral_taxa$taxid == 694002] <- \"Betacoronavirus\"\nviral_taxa$name[viral_taxa$taxid == 694009] <- \"SARS-CoV\"\nviral_taxa$name[viral_taxa$taxid == 694003] <- \"Betacoronavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 568715] <- \"Astrovirus MLB1\"\nviral_taxa$name[viral_taxa$taxid == 194965] <- \"Aichivirus B\"\n\nmrg_hv_named <- mrg_hv %>% left_join(viral_taxa, by=\"taxid\")\n\n# Discover viral species & genera for HV reads\nraise_rank <- function(read_db, taxid_db, out_rank = \"species\", verbose = FALSE){\n # Get higher ranks than search rank\n ranks <- c(\"subspecies\", \"species\", \"subgenus\", \"genus\", \"subfamily\", \"family\", \"suborder\", \"order\", \"class\", \"subphylum\", \"phylum\", \"kingdom\", \"superkingdom\")\n rank_match <- which.max(ranks == out_rank)\n high_ranks <- ranks[rank_match:length(ranks)]\n # Merge read DB and taxid DB\n reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n # Extract sequences that are already at appropriate rank\n reads_rank <- filter(reads, rank == out_rank)\n # Drop sequences at a higher rank and return unclassified sequences\n reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...\n # Promote read taxids and re-merge with taxid DB, then re-classify and filter\n reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%\n bind_rows(reads_rank)\n reads_norank <- reads_remaining %>%\n filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n }\n # Finally, extract and append reads that were excluded during the process\n reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)\n reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n return(reads_out)\n}\nhv_reads_genera <- raise_rank(mrg_hv_named, viral_taxa, \"genus\")\n\n# Count relative abundance for genera\nhv_genera_counts_raw <- hv_reads_genera %>% group_by(group, name) %>%\n summarize(n_reads_hv = sum(hv_status), .groups = \"drop\") %>%\n inner_join(read_counts_agg %>% select(group, n_reads_raw), by=\"group\")\nhv_genera_counts_all <- hv_genera_counts_raw %>% group_by(name) %>%\n summarize(n_reads_hv = sum(n_reads_hv),\n n_reads_raw = sum(n_reads_raw)) %>%\n mutate(group = \"All groups\")\nhv_genera_counts_agg <- bind_rows(hv_genera_counts_raw, hv_genera_counts_all) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw)\n\n# Compute ranks for species and genera and restrict to high-ranking taxa\nmax_rank_genera <- 5\nhv_genera_counts_ranked <- hv_genera_counts_agg %>% group_by(group) %>%\n mutate(rank = rank(desc(n_reads_hv), ties.method=\"max\"),\n highrank = rank <= max_rank_genera) %>%\n group_by(name) %>%\n mutate(highrank_any = any(highrank),\n name_display = ifelse(highrank_any, name, \"Other\")) %>%\n group_by(name_display) %>%\n mutate(mean_rank = mean(rank)) %>%\n arrange(mean_rank) %>% ungroup %>%\n mutate(name_display = fct_inorder(name_display)) %>%\n arrange(str_detect(group, \"Other\")) %>%\n mutate(group = fct_inorder(group))\n\n\n# Plot composition\npalette_rank <- c(brewer.pal(8, \"Dark2\"), brewer.pal(8, \"Set2\"), \"#888888\")\ng_vcomp_genera <- ggplot(hv_genera_counts_ranked, aes(x=group, y=n_reads_hv, fill=name_display)) +\n geom_col(position = \"fill\") +\n scale_fill_manual(values = palette_rank, name = \"Viral genus\") +\n scale_y_continuous(name = \"Fraction of HV reads\", breaks = seq(0,1,0.2), expand = c(0,0)) +\n guides(fill = guide_legend(ncol=3)) +\n theme_kit\ng_vcomp_genera\n\n\n\n\n\n\n\nMamastrovirus and Rotavirus are predominantly enteric RNA viruses whose prominence here makes sense, while the prevalence of Betacoronavirus is probably a result of the ongoing SARS-CoV-2 pandemic. As a DNA virus, the abundance of Mastadenovirus is a bit more surprising; however, this finding is consistent with the public dashboard, and is also borne out by the other BLASTN matches for these sequences, all of which are other adenovirus taxa:\n\nCodemasta_ids <- hv_reads_genera %>% filter(name==\"Mastadenovirus\") %>% pull(seq_id)\nmasta_hits <- bind_rows(blast_results_2_paired %>% full_join(mrg_2_blast %>% select(seq_id, sample, seq_num), by=c(\"sample\", \"seq_num\")),\n blast_results_3_paired) %>%\n select(-sample, -seq_num) %>%\n filter(seq_id %in% masta_ids) %>%\n mutate(staxid = as.integer(staxid))\nmasta_hits_sorted <- masta_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%\n left_join(tax_names, by=\"staxid\") %>%\n mutate(name = fct_inorder(name))\nmasta_hits_sorted_head <- masta_hits_sorted %>% head(10) %>%\n mutate(name=fct_inorder(as.character(name)))\ng_masta <- ggplot(masta_hits_sorted_head,\n aes(x=n, y=fct_inorder(name))) + geom_col() +\n scale_x_continuous(name = \"# Mapped read pairs\") + theme_base +\n theme(axis.title.y = element_blank())\ng_masta\n\n\n\n\n\n\n\nConclusion\nCompared to the last few datasets I analyzed, the analysis of Spurbeck took a long time, numerous attempts, and a lot of computational resources. However, as I said at the start of this post, I’m happy with the outcome and am confident it will improve analysis of future datasets. While the overall prevalence of human-infecting viruses is fairly low in Spurbeck compared to other wastewater datasets I’ve looked at, its inclusion as a core dataset for the P2RA analysis make it especially important to process in a reliable and high-quality manner.\nNext, I’ll turn my attention to more datasets included in the P2RA analysis, as well as some air-sampling datasets we’re interested in for another project."
+ },
+ {
+ "objectID": "notebooks/2024-04-08_brumfield.html",
+ "href": "notebooks/2024-04-08_brumfield.html",
+ "title": "Workflow analysis of Brumfield et al. (2022)",
+ "section": "",
+ "text": "Continuing my analysis of datasets from the P2RA preprint, I analyzed the data from Brumfield et al. (2022). This study conducted longitudinal monitoring of SARS-CoV-2 via qPCR from a single manhole in Maryland, combined with comprehensive microbiome analysis using metagenomics during a major COVID spike in early 2021. In total, they sequenced six samples from February to April 2021.\nOne unusual feature that makes this study interesting is that they conducted both RNA and DNA sequencing on each study. Prior to sequencing, samples underwent concentration with the InnovaPrep Concentrating Pipette, followed by separate DNA & RNA extraction using different kits. RNA samples underwent rRNA depletion prior to library prep. All samples were sequenced on an Illumina HiSeq4000 with 2x150bp reads.\nThe raw data\nThe 6 samples from the Brumfield dataset yielded 24M-45M (mean 33M) DNA-sequencing reads and 29M-64M (mean 46M) RNA-sequencing reads per sample, for a total of 196M DNA read pairs and 276M RNA read pairs (59 and 83 gigabases of sequence, respectively). Read qualities were mostly high but tailed off at the 3’ end in some samples, suggesting the need for trimming. Adapter levels were very high. Inferred duplication levels were 44-58% in DNA reads and 90-96% in RNA reads, i.e. very high.\n\nCode# Data input paths\ndata_dir <- \"../data/2024-03-19_brumfield/\"\nlibraries_path <- file.path(data_dir, \"sample-metadata.csv\")\nbasic_stats_path <- file.path(data_dir, \"qc_basic_stats.tsv\")\nadapter_stats_path <- file.path(data_dir, \"qc_adapter_stats.tsv\")\nquality_base_stats_path <- file.path(data_dir, \"qc_quality_base_stats.tsv\")\nquality_seq_stats_path <- file.path(data_dir, \"qc_quality_sequence_stats.tsv\")\n\n# Import libraries and extract metadata from sample names\nlibraries <- read_csv(libraries_path, show_col_types = FALSE) %>%\n arrange(date) %>% mutate(date = fct_inorder(as.character(date))) %>%\n arrange(desc(na_type)) %>% mutate(na_type = fct_inorder(na_type))\n\n# Import QC data\nstages <- c(\"raw_concat\", \"cleaned\", \"dedup\", \"ribo_initial\", \"ribo_secondary\")\nbasic_stats <- read_tsv(basic_stats_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>% \n mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\nadapter_stats <- read_tsv(adapter_stats_path, show_col_types = FALSE) %>%\n mutate(sample = sub(\"_2$\", \"\", sample)) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\nquality_base_stats <- read_tsv(quality_base_stats_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\nquality_seq_stats <- read_tsv(quality_seq_stats_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(stage = factor(stage, levels = stages),\n read_pair = fct_inorder(as.character(read_pair)))\n\n# Filter to raw data\nbasic_stats_raw <- basic_stats %>% filter(stage == \"raw_concat\")\nadapter_stats_raw <- adapter_stats %>% filter(stage == \"raw_concat\")\nquality_base_stats_raw <- quality_base_stats %>% filter(stage == \"raw_concat\")\nquality_seq_stats_raw <- quality_seq_stats %>% filter(stage == \"raw_concat\")\n\n\n\nCode# Visualize basic stats\nscale_fill_na <- purrr::partial(scale_fill_brewer, palette=\"Set1\", name=\"Nucleic acid type\")\ng_basic <- ggplot(basic_stats_raw, aes(x=date, fill=na_type, group=sample)) +\n scale_fill_na() + theme_kit\ng_nreads_raw <- g_basic + geom_col(aes(y=n_read_pairs), position = \"dodge\") +\n scale_y_continuous(name=\"# Read pairs\", expand=c(0,0))\ng_nreads_raw_2 <- g_nreads_raw + theme(legend.position = \"none\")\nlegend_group <- get_legend(g_nreads_raw)\n\ng_nbases_raw <- g_basic + geom_col(aes(y=n_bases_approx), position = \"dodge\") +\n scale_y_continuous(name=\"Total base pairs (approx)\", expand=c(0,0)) + \n theme(legend.position = \"none\")\ng_ndup_raw <- g_basic + geom_col(aes(y=percent_duplicates), position = \"dodge\") +\n scale_y_continuous(name=\"% Duplicates (FASTQC)\", expand=c(0,0), limits = c(0,100), breaks = seq(0,100,20)) +\n theme(legend.position = \"none\")\n\ng_basic_raw <- plot_grid(g_nreads_raw_2 + g_nbases_raw + g_ndup_raw, legend_group, ncol = 1, rel_heights = c(1,0.1))\ng_basic_raw\n\n\n\n\n\n\n\n\nCodescale_color_na <- purrr::partial(scale_color_brewer,palette=\"Set1\",name=\"Nucleic acid type\")\ng_qual_raw <- ggplot(mapping=aes(color=na_type, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_na() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters_raw <- g_qual_raw + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,NA),\n breaks = seq(0,100,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_wrap(~adapter)\ng_adapters_raw\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base_raw <- g_qual_raw +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0))\ng_quality_base_raw\n\n\n\n\n\n\nCodeg_quality_seq_raw <- g_qual_raw +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0))\ng_quality_seq_raw\n\n\n\n\n\n\n\nPreprocessing\nThe average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. On average, cleaning & deduplication together removed about 50% of total reads from DNA libraries and about 92% from RNA libraries, primarily during deduplication. Only a few reads (low single-digit percentages at most) were lost during subsequent ribodepletion, even in RNA reads, consistent with the samples having undergone rRNA depletion prior to sequencing.\n\nCoden_reads_rel <- basic_stats %>% \n select(sample, na_type, stage, percent_duplicates, n_read_pairs) %>%\n group_by(sample, na_type) %>% arrange(sample, na_type, stage) %>%\n mutate(p_reads_retained = n_read_pairs / lag(n_read_pairs),\n p_reads_lost = 1 - p_reads_retained,\n p_reads_retained_abs = n_read_pairs / n_read_pairs[1],\n p_reads_lost_abs = 1-p_reads_retained_abs,\n p_reads_lost_abs_marginal = p_reads_lost_abs - lag(p_reads_lost_abs))\nn_reads_rel_display <- n_reads_rel %>% rename(Stage=stage, `NA Type`=na_type) %>% \n group_by(`NA Type`, Stage) %>% \n summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), \"-\", round(max(p_reads_lost_abs*100),1), \" (mean \", round(mean(p_reads_lost_abs*100),1), \")\"),\n `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), \"-\", round(max(p_reads_lost_abs_marginal*100),1), \" (mean \", round(mean(p_reads_lost_abs_marginal*100),1), \")\"), .groups=\"drop\") %>% \n filter(Stage != \"raw_concat\") %>%\n mutate(Stage = Stage %>% as.numeric %>% factor(labels=c(\"Trimming & filtering\", \"Deduplication\", \"Initial ribodepletion\", \"Secondary ribodepletion\")))\nn_reads_rel_display\n\n\n \n\n\n\n\nCode# Plot reads over preprocessing\ng_reads_stages <- ggplot(basic_stats, aes(x=stage, y=n_read_pairs,fill=na_type,group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_na() +\n scale_y_continuous(\"# Read pairs\", expand=c(0,0)) +\n theme_kit\ng_reads_stages\n\n\n\n\n\n\nCode# Plot relative read losses during preprocessing\ng_reads_rel <- ggplot(n_reads_rel, aes(x=stage, y=p_reads_lost_abs_marginal,fill=na_type,group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_na() +\n scale_y_continuous(\"% Total Reads Lost\", expand=c(0,0), labels = function(x) x*100) +\n theme_kit\ng_reads_rel\n\n\n\n\n\n\n\nData cleaning with FASTP was very successful at removing adapters, with very few adapter sequences found by FASTQC at any stage after the raw data. FASTP was also successful at improving read quality.\n\nCodeg_qual <- ggplot(mapping=aes(color=na_type, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_na() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters <- g_qual + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,20),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~adapter)\ng_adapters\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base <- g_qual +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_base\n\n\n\n\n\n\nCodeg_quality_seq <- g_qual +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_seq\n\n\n\n\n\n\n\nAccording to FASTQC, deduplication was moderately effective at reducing measured duplicate levels, with FASTQC-measured levels falling from an average of 50% to one of 16% for DNA reads, and from 93% to 42% for RNA reads:\n\nCodeg_dup_stages <- ggplot(basic_stats, aes(x=stage, y=percent_duplicates, fill=na_type, group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_na() +\n scale_y_continuous(\"% Duplicates\", limits=c(0,100), breaks=seq(0,100,20), expand=c(0,0)) +\n theme_kit\ng_readlen_stages <- ggplot(basic_stats, aes(x=stage, y=mean_seq_len, fill=na_type, group=sample)) +\n geom_col(position=\"dodge\") + scale_fill_na() +\n scale_y_continuous(\"Mean read length (nt)\", expand=c(0,0)) +\n theme_kit\nlegend_loc <- get_legend(g_dup_stages)\ng_dup_stages\n\n\n\n\n\n\nCodeg_readlen_stages\n\n\n\n\n\n\n\nHigh-level composition\nAs before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:\n\nCode# Import Bracken data\nbracken_path <- file.path(data_dir, \"bracken_counts.tsv\")\nbracken <- read_tsv(bracken_path, show_col_types = FALSE)\ntotal_assigned <- bracken %>% group_by(sample) %>% summarize(\n name = \"Total\",\n kraken_assigned_reads = sum(kraken_assigned_reads),\n added_reads = sum(added_reads),\n new_est_reads = sum(new_est_reads),\n fraction_total_reads = sum(fraction_total_reads)\n)\nbracken_spread <- bracken %>% select(name, sample, new_est_reads) %>%\n mutate(name = tolower(name)) %>%\n pivot_wider(id_cols = \"sample\", names_from = \"name\", values_from = \"new_est_reads\")\n# Count reads\nread_counts_preproc <- basic_stats %>% \n select(sample, na_type, date, stage, n_read_pairs) %>%\n pivot_wider(id_cols = c(\"sample\", \"na_type\", \"date\"), names_from=\"stage\", values_from=\"n_read_pairs\")\nread_counts <- read_counts_preproc %>%\n inner_join(total_assigned %>% select(sample, new_est_reads), by = \"sample\") %>%\n rename(assigned = new_est_reads) %>%\n inner_join(bracken_spread, by=\"sample\")\n# Assess composition\nread_comp <- transmute(read_counts, sample=sample, na_type=na_type, date=date,\n n_filtered = raw_concat-cleaned,\n n_duplicate = cleaned-dedup,\n n_ribosomal = (dedup-ribo_initial) + (ribo_initial-ribo_secondary),\n n_unassigned = ribo_secondary-assigned,\n n_bacterial = bacteria,\n n_archaeal = archaea,\n n_viral = viruses,\n n_human = eukaryota)\nread_comp_long <- pivot_longer(read_comp, -(sample:date), names_to = \"classification\",\n names_prefix = \"n_\", values_to = \"n_reads\") %>%\n mutate(classification = fct_inorder(str_to_sentence(classification))) %>%\n group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads))\nread_comp_summ <- read_comp_long %>% \n group_by(na_type, classification) %>%\n summarize(n_reads = sum(n_reads), .groups = \"drop_last\") %>%\n mutate(n_reads = replace_na(n_reads,0),\n p_reads = n_reads/sum(n_reads),\n pc_reads = p_reads*100)\n \n# Plot overall composition\ng_comp <- ggplot(read_comp_long, aes(x=sample, y=p_reads, fill=classification)) +\n geom_col(position = \"stack\") +\n scale_y_continuous(name = \"% Reads\", limits = c(0,1.01), breaks = seq(0,1,0.2),\n expand = c(0,0), labels = function(x) x*100) +\n scale_fill_brewer(palette = \"Set1\", name = \"Classification\") +\n facet_wrap(.~na_type, scales=\"free\", ncol=5) +\n theme_kit\ng_comp\n\n\n\n\n\n\nCode# Plot composition of minor components\nread_comp_minor <- read_comp_long %>% filter(classification %in% c(\"Archaeal\", \"Viral\", \"Human\", \"Other\"))\npalette_minor <- brewer.pal(9, \"Set1\")[6:9]\ng_comp_minor <- ggplot(read_comp_minor, aes(x=sample, y=p_reads, fill=classification)) +\n geom_col(position = \"stack\") +\n scale_y_continuous(name = \"% Reads\",\n expand = c(0,0), labels = function(x) x*100) +\n scale_fill_manual(values=palette_minor, name = \"Classification\") +\n facet_wrap(.~na_type, scales = \"free_x\", ncol=5) +\n theme_kit\ng_comp_minor\n\n\n\n\n\n\n\n\nCodep_reads_summ_group <- read_comp_long %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, na_type) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification, na_type) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\")\np_reads_summ_prep <- p_reads_summ_group %>%\n mutate(classification = fct_inorder(classification),\n pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n display = paste0(pc_min, \"-\", pc_max, \"% (mean \", pc_mean, \"%)\"))\np_reads_summ <- p_reads_summ_prep %>%\n select(classification, na_type, display) %>%\n pivot_wider(names_from=na_type, values_from = display)\np_reads_summ\n\n\n \n\n\n\nAs previously noted, RNA samples were overwhelmingly composed of duplicates. Despite this, the RNA samples achieved a decently high level of total viral reads, with an average of 0.55% of reads mapping to viruses (higher than Crits-Christoph). Levels of total viral reads were much lower in DNA libraries, which were dominated by unassigned and (non-ribosomal) bacterial sequences.\nWithin viral reads, RNA libraries were (as usual) dominated by plant viruses, particularly Virgaviridae – though one sample, unusually, has a substantial minority of reads from Fiersviridae, a family of RNA phages. Detected DNA viruses come from a wider range of families, but the most abundant families (Suoliviridae, Intestiviridae, Autographiviridae, Myoviridae) are all DNA phages.\n\nCode# Get viral taxonomy\nviral_taxa_path <- file.path(data_dir, \"viral-taxids.tsv.gz\")\nviral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)\n\n# Import Kraken reports & extract & count viral families\nsamples <- as.character(basic_stats_raw$sample)\ncol_names <- c(\"pc_reads_total\", \"n_reads_clade\", \"n_reads_direct\", \"rank\", \"taxid\", \"name\")\nreport_paths <- paste0(data_dir, \"kraken/\", samples, \".report.gz\")\nkraken_reports <- lapply(1:length(samples), \n function(n) read_tsv(report_paths[n], col_names = col_names,\n show_col_types = FALSE) %>%\n mutate(sample = samples[n])) %>% bind_rows\nkraken_reports_viral <- filter(kraken_reports, taxid %in% viral_taxa$taxid) %>%\n group_by(sample) %>%\n mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])\nviral_families <- kraken_reports_viral %>% filter(rank == \"F\") %>%\n inner_join(libraries, by=\"sample\")\n\n# Identify major viral families\nviral_families_major_tab <- viral_families %>% group_by(name, taxid, na_type) %>%\n summarize(p_reads_viral_max = max(p_reads_viral), .groups=\"drop\") %>%\n filter(p_reads_viral_max >= 0.04)\nviral_families_major_list <- viral_families_major_tab %>% pull(name)\nviral_families_major <- viral_families %>% filter(name %in% viral_families_major_list) %>%\n select(name, taxid, sample, na_type, date, p_reads_viral)\nviral_families_minor <- viral_families_major %>% group_by(sample, date, na_type) %>%\n summarize(p_reads_viral_major = sum(p_reads_viral), .groups = \"drop\") %>%\n mutate(name = \"Other\", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%\n select(name, taxid, sample, na_type, date, p_reads_viral)\nviral_families_display <- bind_rows(viral_families_major, viral_families_minor) %>%\n arrange(desc(p_reads_viral)) %>% mutate(name = factor(name, levels=c(viral_families_major_list, \"Other\")))\ng_families <- ggplot(viral_families_display, aes(x=date, y=p_reads_viral, fill=name)) +\n geom_col(position=\"stack\") +\n scale_fill_brewer(palette = \"Set3\", name = \"Viral family\") +\n scale_y_continuous(name=\"% Viral Reads\", limits=c(0,1), breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n facet_wrap(~na_type) +\n theme_kit\ng_families\n\n\n\n\n\n\n\nGiven the very high level of duplicates in the RNA data, I also repeated the analysis from my second Yang et al. entry, re-running taxonomic composition analysis on pre-deduplication data and comparing the effects of deduplication on composition:\n\nCodeclass_levels <- c(\"Unassigned\", \"Bacterial\", \"Archaeal\", \"Viral\", \"Human\")\nsubset_factor <- 0.05\n\n# 1. Remove filtered & duplicate reads from original Bracken output & renormalize\nread_comp_renorm <- read_comp_long %>%\n filter(! classification %in% c(\"Filtered\", \"Duplicate\")) %>%\n group_by(sample) %>%\n mutate(p_reads = n_reads/sum(n_reads),\n classification = classification %>% as.character %>%\n ifelse(. == \"Ribosomal\", \"Bacterial\", .)) %>%\n group_by(sample, na_type, date, classification) %>%\n summarize(n_reads = sum(n_reads), p_reads = sum(p_reads), .groups = \"drop\") %>%\n mutate(classification = factor(classification, levels = class_levels))\n \n# 2. Import pre-deduplicated \nbracken_path_predup <- file.path(data_dir, \"bracken_counts_subset.tsv\")\nbracken_predup <- read_tsv(bracken_path_predup, show_col_types = FALSE)\ntotal_assigned_predup <- bracken_predup %>% group_by(sample) %>% summarize(\n name = \"Total\",\n kraken_assigned_reads = sum(kraken_assigned_reads),\n added_reads = sum(added_reads),\n new_est_reads = sum(new_est_reads),\n fraction_total_reads = sum(fraction_total_reads)\n)\nbracken_spread_predup <- bracken_predup %>% select(name, sample, new_est_reads) %>%\n mutate(name = tolower(name)) %>%\n pivot_wider(id_cols = \"sample\", names_from = \"name\", values_from = \"new_est_reads\")\n# Count reads\nread_counts_predup <- read_counts_preproc %>%\n select(sample, na_type, date, raw_concat, cleaned) %>%\n mutate(raw_concat = raw_concat * subset_factor, cleaned = cleaned * subset_factor) %>%\n inner_join(total_assigned_predup %>% select(sample, new_est_reads), by = \"sample\") %>%\n rename(assigned = new_est_reads) %>%\n inner_join(bracken_spread_predup, by=\"sample\")\n# Assess composition\nread_comp_predup <- transmute(read_counts_predup, sample=sample, na_type = na_type,\n date=date,\n n_filtered = raw_concat-cleaned,\n n_unassigned = cleaned-assigned,\n n_bacterial = bacteria,\n n_archaeal = archaea,\n n_viral = viruses,\n n_human = eukaryota)\nread_comp_predup_long <- pivot_longer(read_comp_predup, -(sample:date), \n names_to = \"classification\",\n names_prefix = \"n_\", values_to = \"n_reads\") %>%\n mutate(classification = fct_inorder(str_to_sentence(classification))) %>%\n group_by(sample) %>% mutate(p_reads = n_reads/sum(n_reads)) %>%\n filter(! classification %in% c(\"Filtered\", \"Duplicate\")) %>%\n group_by(sample) %>%\n mutate(p_reads = n_reads/sum(n_reads))\n\n# 3. Combine\nread_comp_comb <- bind_rows(read_comp_predup_long %>% mutate(deduplicated = FALSE),\n read_comp_renorm %>% mutate(deduplicated = TRUE)) %>%\n mutate(label = ifelse(deduplicated, \"Post-dedup\", \"Pre-dedup\") %>% fct_inorder)\n\n# Plot overall composition\ng_comp_predup <- ggplot(read_comp_comb, aes(x=label, y=p_reads, fill=classification)) +\n geom_col(position = \"stack\") +\n scale_y_continuous(name = \"% Reads\", limits = c(0,1.01), breaks = seq(0,1,0.2),\n expand = c(0,0), labels = function(x) x*100) +\n scale_fill_brewer(palette = \"Set1\", name = \"Classification\") +\n facet_grid(na_type~date) +\n theme_kit\ng_comp_predup\n\n\n\n\n\n\n\nIn general, deduplication has little effect on the composition of DNA samples, which remain primarily bacterial and unassigned. A few RNA samples show a reduction in the bacterial (more precisely, bacterial + rRNA) fraction after deduplication, consistent with the presence of some true biological duplicate sequences (most likely rRNA) that are being collapsed. Surprisingly, however, the largest effect is in the opposite direction, with several samples showing large increases in bacterial sequences following deduplication. This suggests that some highly abundant non-bacterial sequence is being collapsed in these samples, increasing the apparent abundance of bacteria.\nThe increase in bacterial reads in these samples comes at the expense of (1) human reads and (2) the unassigned fraction. The former suggests the presence of human rRNA sequences that are being erroneously incorporated into the post-deduplication “bacterial” fraction; however, this effect is smaller than the decrease in unassigned reads. One possibility might be other non-human eukaryotic rRNA sequences, which Kraken is unable to assign using our current database.\nHuman-infecting virus reads: validation\nNext, I investigated the human-infecting virus read content of these unenriched samples. To get good results here, I needed to make some changes to the pipeline, including adding Trimmomatic as an additional primer-trimming step to prevent further primer-contamination-based false positives. However, for the sake of time I’m not describing them in detail here; if you want to see more I encourage you to check the commit log at the workflow repo.\nHaving made these changes, the workflow identified a total of 14,073 RNA read pairs and 70 DNA read pairs across all samples (0.09% and 0.00009% of surviving reads, respectively).\n\nCodehv_reads_filtered_path <- file.path(data_dir, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered <- read_tsv(hv_reads_filtered_path, show_col_types = FALSE) %>%\n inner_join(libraries, by=\"sample\") %>% \n arrange(date, na_type)\nn_hv_filtered <- hv_reads_filtered %>% group_by(sample, date, na_type) %>% count %>% inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% select(sample, n_read_pairs), by=\"sample\") %>% rename(n_putative = n, n_total = n_read_pairs) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ <- n_hv_filtered %>% group_by(na_type) %>% summarize(n_putative = sum(n_putative), n_total = sum(n_total)) %>% mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n\n\nCodemrg <- hv_reads_filtered %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n group_by(sample, na_type) %>% arrange(desc(adj_score_fwd), desc(adj_score_rev)) %>%\n mutate(seq_num = row_number(),\n adj_score_max = pmax(adj_score_fwd, adj_score_rev))\n# Import Bowtie2/Kraken data and perform filtering steps\ng_mrg <- ggplot(mrg, aes(x=adj_score_fwd, y=adj_score_rev)) +\n geom_point(alpha=0.5, shape=16) + \n scale_x_continuous(name=\"S(forward read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_y_continuous(name=\"S(reverse read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n facet_grid(na_type~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +\n theme_base + theme(aspect.ratio=1)\ng_mrg\n\n\n\n\n\n\nCodeg_hist_0 <- ggplot(mrg, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + facet_grid(na_type~kraken_label, scales=\"free_y\") + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_0\n\n\n\n\n\n\n\nAs previously described, I ran BLASTN on these reads via a dedicated EC2 instance, using the same parameters I’ve used for previous datasets.\n\nCodemrg_fasta <- mrg %>%\n mutate(seq_head = paste0(\">\", seq_id)) %>%\n ungroup %>%\n select(header1=seq_head, seq1=query_seq_fwd, \n header2=seq_head, seq2=query_seq_rev) %>%\n mutate(header1=paste0(header1, \"_1\"), header2=paste0(header2, \"_2\"))\nmrg_fasta_out <- do.call(paste, c(mrg_fasta, sep=\"\\n\")) %>% paste(collapse=\"\\n\")\nwrite(mrg_fasta_out, file.path(data_dir, \"brumfield-putative-viral.fasta\"))\n\n\n\nCode# Import BLAST results\nblast_results_path <- file.path(data_dir, \"blast/brumfield-putative-viral.blast.gz\")\nblast_cols <- c(\"qseqid\", \"sseqid\", \"sgi\", \"staxid\", \"qlen\", \"evalue\", \"bitscore\", \"qcovs\", \"length\", \"pident\", \"mismatch\", \"gapopen\", \"sstrand\", \"qstart\", \"qend\", \"sstart\", \"send\")\nblast_results <- read_tsv(blast_results_path, show_col_types = FALSE, \n col_names = blast_cols, col_types = cols(.default=\"c\"))\n\n# Add viral status\nblast_results_viral <- mutate(blast_results, viral = staxid %in% viral_taxa$taxid)\n\n# Filter for best hit for each query/subject, then rank hits for each query\nblast_results_best <- blast_results_viral %>% group_by(qseqid, staxid) %>% \n filter(bitscore == max(bitscore)) %>%\n filter(length == max(length)) %>% filter(row_number() == 1)\nblast_results_ranked <- blast_results_best %>% \n group_by(qseqid) %>% mutate(rank = dense_rank(desc(bitscore)))\nblast_results_highrank <- blast_results_ranked %>% filter(rank <= 5) %>%\n mutate(read_pair = str_split(qseqid, \"_\") %>% sapply(nth, n=-1), \n seq_id = str_split(qseqid, \"_\") %>% sapply(nth, n=1)) %>%\n mutate(bitscore = as.numeric(bitscore))\n\n# Summarize by read pair and taxid\nblast_results_paired <- blast_results_highrank %>%\n group_by(seq_id, staxid, viral) %>%\n summarize(bitscore_max = max(bitscore), bitscore_min = min(bitscore),\n n_reads = n(), .groups = \"drop\") %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max, na_type)\nblast_results_assign <- left_join(blast_results_paired, mrg_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = (staxid == taxid),\n taxid_match_kraken = (staxid == assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_results_out <- blast_results_assign %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n\n\nCode# Merge BLAST results with unenriched read data\nmrg_blast <- full_join(mrg, blast_results_out, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\nmrg_blast_rna <- mrg_blast %>% filter(na_type == \"RNA\")\nmrg_blast_dna <- mrg_blast %>% filter(na_type == \"DNA\")\n\n# Plot RNA\ng_mrg_blast_rna <- mrg_blast_rna %>%\n ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +\n geom_point(alpha=0.5, shape=16) +\n scale_x_continuous(name=\"S(forward read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_y_continuous(name=\"S(reverse read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_color_brewer(palette = \"Set1\", name = \"Viral status\") +\n facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +\n theme_base + labs(title=\"RNA\") + \n theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))\ng_mrg_blast_rna\n\n\n\n\n\n\nCode# Plot DNA\ng_mrg_blast_dna <- mrg_blast_dna %>%\n ggplot(aes(x=adj_score_fwd, y=adj_score_rev, color=viral_status_out)) +\n geom_point(alpha=0.5, shape=16) +\n scale_x_continuous(name=\"S(forward read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_y_continuous(name=\"S(reverse read)\", limits=c(0,65), breaks=seq(0,100,10), expand = c(0,0)) +\n scale_color_brewer(palette = \"Set1\", name = \"Viral status\") +\n facet_grid(viral_status_out~kraken_label, labeller = labeller(kit = label_wrap_gen(20))) +\n theme_base + labs(title=\"DNA\") + \n theme(aspect.ratio=1, plot.title = element_text(size=rel(2), hjust=0))\ng_mrg_blast_dna\n\n\n\n\n\n\n\n\nCodeg_hist_1 <- ggplot(mrg_blast, aes(x=adj_score_max)) + geom_histogram(binwidth=5,boundary=0,position=\"dodge\") + facet_grid(na_type~viral_status_out, scales = \"free_y\") + scale_x_continuous(name = \"Maximum adjusted alignment score\") + scale_y_continuous(name=\"# Read pairs\") + scale_fill_brewer(palette = \"Dark2\") + theme_base + geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\")\ng_hist_1\n\n\n\n\n\n\n\nThese results look very good on visual inspection, and indeed precision and sensitivity are both very high. For a disjunctive score threshold of 20, my updated workflow achieves an F1 score of 99.0% for RNA sequences and 99.2% for DNA sequences; more than high enough to continue on to human viral analysis.\n\nCodetest_sens_spec <- function(tab, score_threshold, conjunctive, include_special){\n if (!include_special) tab <- filter(tab, viral_status_out %in% c(\"TRUE\", \"FALSE\"))\n tab_retained <- tab %>% mutate(\n conjunctive = conjunctive,\n retain_score_conjunctive = (adj_score_fwd > score_threshold & adj_score_rev > score_threshold), \n retain_score_disjunctive = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),\n retain_score = ifelse(conjunctive, retain_score_conjunctive, retain_score_disjunctive),\n retain = assigned_hv | hit_hv | retain_score) %>%\n group_by(viral_status_out, retain) %>% count\n pos_tru <- tab_retained %>% filter(viral_status_out == \"TRUE\", retain) %>% pull(n) %>% sum\n pos_fls <- tab_retained %>% filter(viral_status_out != \"TRUE\", retain) %>% pull(n) %>% sum\n neg_tru <- tab_retained %>% filter(viral_status_out != \"TRUE\", !retain) %>% pull(n) %>% sum\n neg_fls <- tab_retained %>% filter(viral_status_out == \"TRUE\", !retain) %>% pull(n) %>% sum\n sensitivity <- pos_tru / (pos_tru + neg_fls)\n specificity <- neg_tru / (neg_tru + pos_fls)\n precision <- pos_tru / (pos_tru + pos_fls)\n f1 <- 2 * precision * sensitivity / (precision + sensitivity)\n out <- tibble(threshold=score_threshold, include_special = include_special, \n conjunctive = conjunctive, sensitivity=sensitivity, \n specificity=specificity, precision=precision, f1=f1)\n return(out)\n}\nrange_f1 <- function(intab, inc_special, inrange=15:45){\n tss <- purrr::partial(test_sens_spec, tab=intab, include_special=inc_special)\n stats_conj <- lapply(inrange, tss, conjunctive=TRUE) %>% bind_rows\n stats_disj <- lapply(inrange, tss, conjunctive=FALSE) %>% bind_rows\n stats_all <- bind_rows(stats_conj, stats_disj) %>%\n pivot_longer(!(threshold:conjunctive), names_to=\"metric\", values_to=\"value\") %>%\n mutate(conj_label = ifelse(conjunctive, \"Conjunctive\", \"Disjunctive\"))\n return(stats_all)\n}\ninc_special <- FALSE\nstats_rna <- range_f1(mrg_blast_rna, inc_special) %>% mutate(na_type = \"RNA\")\nstats_dna <- range_f1(mrg_blast_dna, inc_special) %>% mutate(na_type = \"DNA\")\nstats_0 <- bind_rows(stats_rna, stats_dna) %>% mutate(attempt=0)\nthreshold_opt_0 <- stats_0 %>% group_by(conj_label,attempt,na_type) %>% \n filter(metric == \"f1\") %>%\n filter(value == max(value)) %>% filter(threshold == min(threshold))\ng_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +\n geom_vline(data = threshold_opt_0, mapping = aes(xintercept=threshold),\n color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Set3\") +\n facet_grid(na_type~conj_label) +\n theme_base\ng_stats_0\n\n\n\n\n\n\n\nHuman-infecting virus reads: analysis\n\nCode# Get raw read counts\nread_counts_raw <- basic_stats_raw %>%\n select(sample, date, na_type, n_reads_raw = n_read_pairs)\n\n# Get HV read counts\nmrg_hv <- mrg %>% mutate(hv_status = assigned_hv | hit_hv | adj_score_max >= 20)\nread_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% count(name=\"n_reads_hv\")\nread_counts <- read_counts_raw %>% left_join(read_counts_hv, by=\"sample\") %>%\n mutate(n_reads_hv = replace_na(n_reads_hv, 0))\n\n# Aggregate\nread_counts_total <- read_counts %>% group_by(na_type) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv)) %>%\n mutate(sample= \"All samples\", date = \"All dates\")\nread_counts_agg <- read_counts %>% arrange(date) %>% \n mutate(date = as.character(date)) %>% arrange(sample) %>%\n bind_rows(read_counts_total) %>% \n mutate(sample = fct_inorder(sample),\n date = fct_inorder(date),\n p_reads_hv = n_reads_hv/n_reads_raw)\n\n# Get old counts\nn_hv_reads_old <- c(691, 121, 18, 224, 2, 26, 6, 21, 4, 29, 12, 22)\nn_hv_reads_old[13] <- sum(n_hv_reads_old[which(read_counts$na_type==\"RNA\")])\nn_hv_reads_old[14] <- sum(n_hv_reads_old[which(read_counts$na_type==\"DNA\")])\n\nread_counts_comp <- read_counts_agg %>%\n mutate(n_reads_hv_old = n_hv_reads_old,\n p_reads_hv_old = n_reads_hv_old/n_reads_raw)\n\n\nApplying a disjunctive cutoff at S=20 identifies 13,809 RNA reads and 66 DNA reads as human-viral. This gives an overall relative HV abundance of \\(5.00 \\times 10^{-5}\\) for RNA reads and \\(3.66 \\times 10^{-7}\\) for DNA reads. In contrast, the overall HV in the public dashboard is \\(3.93 \\times 10^{-6}\\) for RNA reads and \\(4.54 \\times 10^{-7}\\); the measured HV has thus increased 12.7x for RNA reads, but decreased slightly for DNA reads.\n\nCode# Visualize\ng_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=na_type)) +\n geom_point(aes(y=p_reads_hv)) +\n scale_y_log10(\"Relative abundance of human virus reads\") +\n scale_color_na() + theme_kit\ng_phv_agg\n\n\n\n\n\n\n\nDigging into individual viruses, we see that fecal-oral viruses predominate, especially Mamastrovirus, Rotavirus, Salivirus, and fecal-oral Enterovirus species (especially Enterovirus C, which includes poliovirus):\n\nCode# Get viral taxon names for putative HV reads\nviral_taxa$name[viral_taxa$taxid == 249588] <- \"Mamastrovirus\"\nviral_taxa$name[viral_taxa$taxid == 194960] <- \"Kobuvirus\"\nviral_taxa$name[viral_taxa$taxid == 688449] <- \"Salivirus\"\nviral_taxa$name[viral_taxa$taxid == 585893] <- \"Picobirnaviridae\"\nviral_taxa$name[viral_taxa$taxid == 333922] <- \"Betapapillomavirus\"\n\nviral_taxa$name[viral_taxa$taxid == 334207] <- \"Betapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 369960] <- \"Porcine type-C oncovirus\"\nviral_taxa$name[viral_taxa$taxid == 333924] <- \"Betapapillomavirus 2\"\n\nmrg_hv_named <- mrg_hv %>% left_join(viral_taxa, by=\"taxid\")\n\n# Discover viral species & genera for HV reads\nraise_rank <- function(read_db, taxid_db, out_rank = \"species\", verbose = FALSE){\n # Get higher ranks than search rank\n ranks <- c(\"subspecies\", \"species\", \"subgenus\", \"genus\", \"subfamily\", \"family\", \"suborder\", \"order\", \"class\", \"subphylum\", \"phylum\", \"kingdom\", \"superkingdom\")\n rank_match <- which.max(ranks == out_rank)\n high_ranks <- ranks[rank_match:length(ranks)]\n # Merge read DB and taxid DB\n reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n # Extract sequences that are already at appropriate rank\n reads_rank <- filter(reads, rank == out_rank)\n # Drop sequences at a higher rank and return unclassified sequences\n reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...\n # Promote read taxids and re-merge with taxid DB, then re-classify and filter\n reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%\n bind_rows(reads_rank)\n reads_norank <- reads_remaining %>%\n filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n }\n # Finally, extract and append reads that were excluded during the process\n reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)\n reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n return(reads_out)\n}\nhv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, \"species\")\nhv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, \"genus\")\nhv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, \"family\")\n\n\n\nCode# Count reads for each human-viral family\nhv_family_counts <- hv_reads_family %>% \n group_by(sample, date, na_type, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, na_type) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nhv_family_major_tab <- hv_family_counts %>% group_by(name) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.01)\nhv_family_counts_major <- hv_family_counts %>%\n mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, \"Other\")) %>%\n group_by(sample, date, na_type, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_family_major_tab$name, \"Other\")))\n\n# Plot family composition\npalette <- c(brewer.pal(12, \"Set3\"), \"#AAAAAA\")\ng_hv_families <- ggplot(hv_family_counts_major, \n aes(x=date, y=p_reads_hv, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_fill_manual(name=\"Viral\\nfamily\", values=palette) +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1), breaks=seq(0,1,0.2)) +\n facet_grid(.~na_type) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_hv_families\n\n\n\n\n\n\n\n\nCode# Count reads for each human-viral genus\nhv_genus_counts <- hv_reads_genus %>% \n group_by(sample, date, na_type, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, na_type) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking genera and group others\nhv_genus_major_tab <- hv_genus_counts %>% group_by(name) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.02)\nhv_genus_counts_major <- hv_genus_counts %>%\n mutate(name_display = ifelse(name %in% hv_genus_major_tab$name, name, \"Other\")) %>%\n group_by(sample, date, na_type, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_genus_major_tab$name, \"Other\")))\n\n# Plot genus composition\npalette <- c(brewer.pal(12, \"Set3\"), \"#AAAAAA\")\ng_hv_genera <- ggplot(hv_genus_counts_major, \n aes(x=date, y=p_reads_hv, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_fill_manual(name=\"Viral\\ngenus\", values=palette) +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1), breaks=seq(0,1,0.2)) +\n facet_grid(.~na_type) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_hv_genera\n\n\n\n\n\n\n\n\nCode# Count reads for each human-viral species\nhv_species_counts <- hv_reads_species %>% \n group_by(sample, date, na_type, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, na_type) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking species and group others\nhv_species_major_tab <- hv_species_counts %>% group_by(name, taxid) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > 0.05)\nhv_species_counts_major <- hv_species_counts %>%\n mutate(name_display = ifelse(name %in% hv_species_major_tab$name, name, \"Other\")) %>%\n group_by(sample, date, na_type, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_species_major_tab$name, \"Other\")))\n\n# Plot species composition\npalette <- c(brewer.pal(12, \"Set3\"), \"#AAAAAA\")\ng_hv_species <- ggplot(hv_species_counts_major, \n aes(x=date, y=p_reads_hv, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_fill_manual(name=\"Viral\\nspecies\", values=palette) +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1), breaks=seq(0,1,0.2)) +\n facet_grid(.~na_type) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_hv_species\n\n\n\n\n\n\n\nBy far the most common virus according to my pipeline (with over 91% of human-viral reads) is Rotavirus A; second (with 8%) is Orthopicobirnavirus hominis. Together, these two enteric RNA viruses dominate HV RNA reads in all samples. Both appear to be real; at least, BLASTN primarily maps these reads to the same or closely-related viruses to that identified by Bowtie2:\n\nCode# Import names db\ntax_names_path <- file.path(data_dir, \"taxid-names.tsv.gz\")\ntax_names <- read_tsv(tax_names_path, show_col_types = FALSE)\n\n# Add missing names\ntax_names_new <- tibble(\n staxid = c(557241, 557245, 557232, 557247, 557242, 557245),\n name = c(\"Canine rotavirus A79-10/G3P[3]\", \"Human rotavirus Ro1845\", \"Canine rotavirus K9\",\n \"Human rotavirus HCR3A\", \"Feline rotavirus Cat97/G3P[3]\", \"Feline rotavirus Cat2/G3P[9]\")\n)\ntax_names <- bind_rows(tax_names, tax_names_new)\n\nrota_ids <- hv_reads_species %>% filter(taxid==28875) %>% pull(seq_id)\nrota_hits <- blast_results_paired %>%\n filter(seq_id %in% rota_ids) %>%\n mutate(staxid = as.integer(staxid))\nrota_hits_sorted <- rota_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%\n left_join(tax_names, by=\"staxid\") %>%\n mutate(name = fct_inorder(name))\nrota_hits_sorted_head <- rota_hits_sorted %>% head(10) %>%\n mutate(name=fct_inorder(as.character(name)))\ng_rota <- ggplot(rota_hits_sorted_head,\n aes(x=n, y=fct_inorder(name))) + geom_col() +\n scale_x_continuous(name = \"# Mapped read pairs\") + theme_base +\n theme(axis.title.y = element_blank())\ng_rota\n\n\n\n\n\n\n\n\nCode# Add missing names\ntax_names_new <- tibble(\n staxid = c(442302, 3003954, 585895),\n name = c(\"Porcine picobirnavirus\", \"ticpantry virus 7\", \"uncultured picobirnavirus\")\n)\ntax_names <- bind_rows(tax_names, tax_names_new)\n\npico_ids <- hv_reads_species %>% filter(taxid==2956252) %>% pull(seq_id)\npico_hits <- blast_results_paired %>%\n filter(seq_id %in% pico_ids) %>%\n mutate(staxid = as.integer(staxid))\npico_hits_sorted <- pico_hits %>% group_by(staxid) %>% count %>% arrange(desc(n)) %>%\n left_join(tax_names, by=\"staxid\") %>%\n mutate(name = fct_inorder(name))\npico_hits_sorted_head <- pico_hits_sorted %>% head(10) %>%\n mutate(name=fct_inorder(as.character(name)))\ng_pico <- ggplot(pico_hits_sorted_head,\n aes(x=n, y=fct_inorder(name))) + geom_col() +\n scale_x_continuous(name = \"# Mapped read pairs\") + theme_base +\n theme(axis.title.y = element_blank())\ng_pico\n\n\n\n\n\n\n\nConclusion\nI’m quite happy with the quality of the human-viral selection process for this dataset, which ended up achieving very high precision and sensitivity. The most striking result coming out of this analysis is probably the drastic difference in total HV abundance between RNA and DNA reads, with the former >100x higher despite very similar processing methods. In the past we’ve attributed low HV abundance in the DNA datasets we’ve analyzed to the lack of viral enrichment in available DNA datasets; in this case, however, there is very little difference between the DNA and RNA processing methods, so this explanation doesn’t really hold. This makes me more pessimistic about achieving good HV relative abundance with DNA sequencing, even with improved viral enrichment methods."
}
]
\ No newline at end of file
diff --git a/notebooks/2024-04-08_brumfield.qmd b/notebooks/2024-04-08_brumfield.qmd
index 7881ed8..810b062 100644
--- a/notebooks/2024-04-08_brumfield.qmd
+++ b/notebooks/2024-04-08_brumfield.qmd
@@ -11,7 +11,6 @@ format:
df-print: paged
editor: visual
title-block-banner: black
-draft: true
---
```{r}