diff --git a/data/2024-05-01_ng/adapters.fasta b/data/2024-05-01_ng/adapters.fasta new file mode 100644 index 0000000..cd41de8 --- /dev/null +++ b/data/2024-05-01_ng/adapters.fasta @@ -0,0 +1,41 @@ +>0 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>1 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>2 +unspecified +>3 +heifigepsna +>4 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>5 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>6 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>7 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>8 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>9 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>10 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>11 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>12 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>13 +CAAGCAGAAGACGGCATACGAGAT +>14 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>15 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>16 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>17 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>18 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>19 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT diff --git a/data/2024-05-01_ng/hv_clade_counts.tsv.gz b/data/2024-05-01_ng/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..f95beed Binary files /dev/null and b/data/2024-05-01_ng/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-01_ng/hv_hits_blast_paired.tsv.gz b/data/2024-05-01_ng/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..189785f Binary files /dev/null and b/data/2024-05-01_ng/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-01_ng/hv_hits_putative_filtered.tsv.gz b/data/2024-05-01_ng/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..32fd26f Binary files /dev/null and b/data/2024-05-01_ng/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-01_ng/kraken_reports.tsv.gz b/data/2024-05-01_ng/kraken_reports.tsv.gz new file mode 100644 index 0000000..b35716a Binary files /dev/null and b/data/2024-05-01_ng/kraken_reports.tsv.gz differ diff --git a/data/2024-05-01_ng/qc_adapter_stats.tsv.gz 
b/data/2024-05-01_ng/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..ce48d39 Binary files /dev/null and b/data/2024-05-01_ng/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-01_ng/qc_basic_stats.tsv.gz b/data/2024-05-01_ng/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..09a55d0 Binary files /dev/null and b/data/2024-05-01_ng/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-01_ng/qc_quality_base_stats.tsv.gz b/data/2024-05-01_ng/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..8dd266c Binary files /dev/null and b/data/2024-05-01_ng/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-01_ng/qc_quality_sequence_stats.tsv.gz b/data/2024-05-01_ng/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..00beff0 Binary files /dev/null and b/data/2024-05-01_ng/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-01_ng/sample-metadata.csv b/data/2024-05-01_ng/sample-metadata.csv new file mode 100644 index 0000000..9917be9 --- /dev/null +++ b/data/2024-05-01_ng/sample-metadata.csv @@ -0,0 +1,37 @@ +library,sample,sample_type,date,bioproject,dataset +SRR6837549,SRR6837549,Effluent from Secondary Settling Tank (SST),2016-10,PRJNA438174,Ng 2019 +SRR6837550,SRR6837550,Effluent from Membrane Biorector (MBR),2016-10,PRJNA438174,Ng 2019 +SRR6837551,SRR6837551,Effluent from Primary Settling Tank (PST),2016-10,PRJNA438174,Ng 2019 +SRR6837552,SRR6837552,Influent,2016-10,PRJNA438174,Ng 2019 +SRR6837553,SRR6837553,Effluent from Primary Settling Tank (PST),2016-11,PRJNA438174,Ng 2019 +SRR6837554,SRR6837554,Influent,2016-11,PRJNA438174,Ng 2019 +SRR6837555,SRR6837555,Sludge (SLUDGE),2016-10,PRJNA438174,Ng 2019 +SRR6837556,SRR6837556,Effluent from Wet Well (WW),2016-10,PRJNA438174,Ng 2019 +SRR6837557,SRR6837557,Effluent from Secondary Settling Tank (SST),2016-11,PRJNA438174,Ng 2019 +SRR6837558,SRR6837558,Effluent from Membrane Biorector (MBR),2016-11,PRJNA438174,Ng 2019 
+SRR6837559,SRR6837559,Influent,2017-05,PRJNA438174,Ng 2019 +SRR6837560,SRR6837560,Effluent from Primary Settling Tank (PST),2017-05,PRJNA438174,Ng 2019 +SRR6837561,SRR6837561,Effluent from Membrane Biorector (MBR),2017-05,PRJNA438174,Ng 2019 +SRR6837562,SRR6837562,Effluent from Secondary Settling Tank (SST),2017-05,PRJNA438174,Ng 2019 +SRR6837563,SRR6837563,Effluent from Membrane Biorector (MBR),2017-03,PRJNA438174,Ng 2019 +SRR6837564,SRR6837564,Effluent from Secondary Settling Tank (SST),2017-03,PRJNA438174,Ng 2019 +SRR6837565,SRR6837565,Effluent from Wet Well (WW),2017-03,PRJNA438174,Ng 2019 +SRR6837566,SRR6837566,Sludge (SLUDGE),2017-03,PRJNA438174,Ng 2019 +SRR6837567,SRR6837567,Effluent from Membrane Biorector (MBR),2017-08,PRJNA438174,Ng 2019 +SRR6837568,SRR6837568,Effluent from Wet Well (WW),2017-05,PRJNA438174,Ng 2019 +SRR6837569,SRR6837569,Sludge (SLUDGE),2017-05,PRJNA438174,Ng 2019 +SRR6837570,SRR6837570,Effluent from Secondary Settling Tank (SST),2017-08,PRJNA438174,Ng 2019 +SRR6837571,SRR6837571,Sludge (SLUDGE),2016-11,PRJNA438174,Ng 2019 +SRR6837572,SRR6837572,Effluent from Wet Well (WW),2016-11,PRJNA438174,Ng 2019 +SRR6837573,SRR6837573,Effluent from Primary Settling Tank (PST),2017-01,PRJNA438174,Ng 2019 +SRR6837574,SRR6837574,Influent,2017-01,PRJNA438174,Ng 2019 +SRR6837575,SRR6837575,Effluent from Secondary Settling Tank (SST),2017-01,PRJNA438174,Ng 2019 +SRR6837576,SRR6837576,Effluent from Membrane Biorector (MBR),2017-01,PRJNA438174,Ng 2019 +SRR6837577,SRR6837577,Sludge (SLUDGE),2017-01,PRJNA438174,Ng 2019 +SRR6837578,SRR6837578,Effluent from Wet Well (WW),2017-01,PRJNA438174,Ng 2019 +SRR6837579,SRR6837579,Effluent from Primary Settling Tank (PST),2017-03,PRJNA438174,Ng 2019 +SRR6837580,SRR6837580,Influent,2017-03,PRJNA438174,Ng 2019 +SRR6837581,SRR6837581,Effluent from Primary Settling Tank (PST),2017-08,PRJNA438174,Ng 2019 +SRR6837582,SRR6837582,Influent,2017-08,PRJNA438174,Ng 2019 +SRR6837583,SRR6837583,Sludge (SLUDGE),2017-08,PRJNA438174,Ng 
2019 +SRR6837584,SRR6837584,Effluent from Wet Well (WW),2017-08,PRJNA438174,Ng 2019 \ No newline at end of file diff --git a/data/2024-05-01_ng/taxid-names.tsv.gz b/data/2024-05-01_ng/taxid-names.tsv.gz new file mode 120000 index 0000000..626546b --- /dev/null +++ b/data/2024-05-01_ng/taxid-names.tsv.gz @@ -0,0 +1 @@ +../2024-04-01_spurbeck/taxid-names.tsv.gz \ No newline at end of file diff --git a/data/2024-05-01_ng/taxonomic_composition.tsv.gz b/data/2024-05-01_ng/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..848aca6 Binary files /dev/null and b/data/2024-05-01_ng/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-01_ng/viral-taxids.tsv.gz b/data/2024-05-01_ng/viral-taxids.tsv.gz new file mode 120000 index 0000000..349083e --- /dev/null +++ b/data/2024-05-01_ng/viral-taxids.tsv.gz @@ -0,0 +1 @@ +../2024-03-19_brumfield/viral-taxids.tsv.gz \ No newline at end of file diff --git a/docs/img/2024-05-01_ng-schematic.png b/docs/img/2024-05-01_ng-schematic.png new file mode 100644 index 0000000..eecfad3 Binary files /dev/null and b/docs/img/2024-05-01_ng-schematic.png differ diff --git a/docs/index.html b/docs/index.html index da0da46..e315eeb 100644 --- a/docs/index.html +++ b/docs/index.html @@ -165,7 +165,29 @@
-
+
+
+

+

+

+
+ + +
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
-
+
@@ -715,7 +737,7 @@

-
+

diff --git a/docs/listings.json b/docs/listings.json index 275ffe5..b0f90b2 100644 --- a/docs/listings.json +++ b/docs/listings.json @@ -2,6 +2,7 @@ { "listing": "/index.html", "items": [ + "/notebooks/2024-05-01_ng.html", "/notebooks/2024-05-01_bengtsson-palme.html", "/notebooks/2024-04-30_brinch.html", "/notebooks/2024-04-19_leung.html", diff --git a/docs/notebooks/2024-05-01_bengtsson-palme.html b/docs/notebooks/2024-05-01_bengtsson-palme.html index 342e86e..6a98f8b 100644 --- a/docs/notebooks/2024-05-01_bengtsson-palme.html +++ b/docs/notebooks/2024-05-01_bengtsson-palme.html @@ -6,7 +6,7 @@ -Will’s Public NAO Notebook - Workflow analysis of Bengtsson-Palme et al. (2021) +Will’s Public NAO Notebook - Workflow analysis of Bengtsson-Palme et al. (2016) + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Workflow analysis of Ng et al. (2019)

+

Wastewater from Singapore.

+
+
+ + +
+ +
+
Author
+
+

Will Bradshaw

+
+
+ +
+
Published
+
+

May 1, 2024

+
+
+ + +
+ + +
+ + + + +

Continuing my analysis of datasets from the P2RA preprint, I analyzed the data from Ng et al. (2019), a study that used DNA sequencing of wastewater samples to characterize the bacterial microbiota and resistome in Singapore. This study used processing methods I haven’t seen before:

+
    +
  1. All samples passed through “a filter” on-site at the WWTP prior to further processing in lab.

  2. +
  3. Samples concentrated to 400ml using a Hemoflow dialyzer “via standard bloodline tubing”.

  4. +
  5. Eluted concentrates then further concentrated by passing through a 0.22um filter and retaining the retentate (NB: this is anti-selecting for viruses).

  6. +
  7. Sludge samples were instead centrifuged and the pellet kept for further analysis.

  8. +
  9. After concentration, samples underwent DNA extraction with the PowerSoil DNA Isolation Kit, then underwent library prep and Illumina sequencing with an Illumina HiSeq2500 (2x250bp).

  10. +
+

Since this was a bacteria-focused study that used processing methods we expect to select against viruses, we wouldn’t expect to see high viral relative abundances here. Nevertheless, it’s worth seeing what we can see.

+

The raw data

+

Samples were collected from six different locations in the treatment plant on six different dates (from October 2016 to August 2017) for a total of 36 samples:

+

+
+
Code
# Import sample metadata. Only one metadata file exists for this dataset; lapply + bind_rows is kept so the same code works if paths ever becomes a vector of files. (NOTE(review): an earlier comment claimed samples were "split across three pipeline runs", but only one data directory is read here.)
+data_dir <- "../data/2024-05-01_ng"
+
+# Data input paths
+libraries_path <- file.path(data_dir, "sample-metadata.csv")
+basic_stats_path <- file.path(data_dir, "qc_basic_stats.tsv.gz")
+adapter_stats_path <- file.path(data_dir, "qc_adapter_stats.tsv.gz")
+quality_base_stats_path <- file.path(data_dir, "qc_quality_base_stats.tsv.gz")
+quality_seq_stats_path <- file.path(data_dir, "qc_quality_sequence_stats.tsv.gz")
+
+# Import libraries and extract metadata from sample names.
+# sample_type_long strips the parenthesised abbreviation from sample_type;
+# sample_type_short pulls out the abbreviation itself ("INF" for influent,
+# which has no parentheses in the metadata) and orders it by `locs`,
+# the plant flow order used for plotting.
+locs <- c("INF", "PST", "SLUDGE", "SST", "MBR", "WW")
+libraries_raw <- lapply(libraries_path, read_csv, show_col_types = FALSE) %>%
+  bind_rows
+libraries <- libraries_raw %>%
+  mutate(sample_type_long = gsub(" \\(.*", "", sample_type),
+         sample_type_short = ifelse(sample_type_long == "Influent", "INF",
+                                    sub(".*\\((.*)\\)", "\\1", sample_type)),
+         sample_type_short = factor(sample_type_short, levels=locs)) %>%
+  arrange(sample_type_short, date) %>%
+  mutate(sample_type_long = fct_inorder(sample_type_long),
+         sample = fct_inorder(sample)) %>%
+  arrange(date) %>%
+  mutate(date = fct_inorder(date))
+
+# Make table of sample counts per sample type
+count_samples <- libraries %>% group_by(sample_type_long, sample_type_short) %>%
+  count %>%
+  rename(`Sample Type`=sample_type_long, Abbreviation=sample_type_short)
+count_samples
+
+
+ +
+
+
+
+
Code
# Import QC data from the pipeline output TSVs and join to sample metadata
+stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary")
+# import_basic: read one or more QC TSVs, attach library metadata, and order
+# samples/stages for plotting. `stage` is factored in pipeline order so
+# downstream facets and line plots follow processing order.
+import_basic <- function(paths){
+  lapply(paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>%
+    inner_join(libraries, by="sample") %>%
+      arrange(sample_type_short, date, sample) %>%
+    mutate(stage = factor(stage, levels = stages),
+           sample = fct_inorder(sample))
+}
+# import_basic_paired: as import_basic, but also orders the read_pair
+# column (forward/reverse mate) as a factor for paired-end QC stats.
+import_basic_paired <- function(paths){
+  import_basic(paths) %>% arrange(read_pair) %>% 
+    mutate(read_pair = fct_inorder(as.character(read_pair)))
+}
+basic_stats <- import_basic(basic_stats_path)
+adapter_stats <- import_basic_paired(adapter_stats_path)
+quality_base_stats <- import_basic_paired(quality_base_stats_path)
+quality_seq_stats <- import_basic_paired(quality_seq_stats_path)
+
+# Filter to raw data (pre-cleaning stage only)
+basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat")
+adapter_stats_raw <- adapter_stats %>% filter(stage == "raw_concat")
+quality_base_stats_raw <- quality_base_stats %>% filter(stage == "raw_concat")
+quality_seq_stats_raw <- quality_seq_stats %>% filter(stage == "raw_concat")
+
+# Get key values for the prose readout: read-pair count range/mean/total,
+# total bases, and FASTQC duplicate-percentage range/mean
+raw_read_counts <- basic_stats_raw %>% ungroup %>% 
+  summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs),
+            rmean=mean(n_read_pairs), 
+            rtot = sum(n_read_pairs),
+            btot = sum(n_bases_approx),
+            dmin = min(percent_duplicates), dmax=max(percent_duplicates),
+            dmean=mean(percent_duplicates), .groups = "drop")
+
+
+

These 36 samples yielded 26.6M-74.1M (mean 46.1M) reads per sample, for a total of 1.7B read pairs (830 gigabases of sequence). Read qualities were mostly high but tailed off towards the 3’ end, requiring some trimming. Adapter levels were fairly low but still in need of some trimming. Inferred duplication levels were variable (1-64%, mean 31%), with libraries with lower read depth showing much lower duplicate levels.

+
+
Code
# Prepare data
+basic_stats_raw_metrics <- basic_stats_raw %>%
+  select(sample, sample_type_short, date,
+         `# Read pairs` = n_read_pairs,
+         `Total base pairs\n(approx)` = n_bases_approx,
+         `% Duplicates\n(FASTQC)` = percent_duplicates) %>%
+  pivot_longer(-(sample:date), names_to = "metric", values_to = "value") %>%
+  mutate(metric = fct_inorder(metric))
+
+# Set up plot templates
+scale_fill_st <- purrr::partial(scale_fill_brewer, palette="Set1", name="Sample Type")
+g_basic <- ggplot(basic_stats_raw_metrics, 
+                  aes(x=sample, y=value, fill=sample_type_short, 
+                      group=interaction(sample_type_short,sample))) +
+  geom_col(position = "dodge") +
+  scale_y_continuous(expand=c(0,0)) +
+  expand_limits(y=c(0,100)) +
+  scale_fill_st() + 
+  facet_grid(metric~., scales = "free", space="free_x", switch="y") +
+  theme_xblank + theme(
+    axis.title.y = element_blank(),
+    strip.text.y = element_text(face="plain")
+  )
+g_basic
+
+
+

+
+
+
+
+
+
Code
# Set up plotting templates
+scale_color_st <- purrr::partial(scale_color_brewer, palette="Set1",
+                                   name="Sample Type")
+g_qual_raw <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, 
+                         group=interaction(sample,read_pair))) + 
+  scale_color_st() + scale_linetype_discrete(name = "Read Pair") +
+  guides(color=guide_legend(nrow=2,byrow=TRUE),
+         linetype = guide_legend(nrow=2,byrow=TRUE)) +
+  theme_base
+
+# Visualize adapters
+g_adapters_raw <- g_qual_raw + 
+  geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +
+  scale_y_continuous(name="% Adapters", limits=c(0,NA),
+                     breaks = seq(0,100,1), expand=c(0,0)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,500,20), expand=c(0,0)) +
+  facet_grid(.~adapter)
+g_adapters_raw
+
+
+

+
+
+
+
Code
# Visualize quality
+g_quality_base_raw <- g_qual_raw +
+  geom_hline(yintercept=25, linetype="dashed", color="red") +
+  geom_hline(yintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +
+  scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,500,20), expand=c(0,0))
+g_quality_base_raw
+
+
+

+
+
+
+
Code
g_quality_seq_raw <- g_qual_raw +
+  geom_vline(xintercept=25, linetype="dashed", color="red") +
+  geom_vline(xintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +
+  scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+  scale_y_continuous(name="# Sequences", expand=c(0,0))
+g_quality_seq_raw
+
+
+

+
+
+
+
+

Preprocessing

+

The average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. As expected given the observed difference in duplication levels, many more reads were lost during deduplication in liquid samples than sludge samples. Conversely, trimming and filtering consistently removed more reads in sludge than in liquid samples, though the effect was less dramatic than for deduplication. Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries.

+
+
Code
n_reads_rel <- basic_stats %>% 
+  select(sample, sample_type_short, date, stage, 
+         percent_duplicates, n_read_pairs) %>%
+  group_by(sample) %>% arrange(sample, stage) %>%
+  # Per-sample read retention/loss at each stage:
+  #   p_reads_retained          - fraction kept relative to the previous stage
+  #                               (lag() is NA at the first stage; replaced by 0)
+  #   p_reads_retained_abs/lost - fraction kept/lost relative to the raw input
+  #   p_reads_lost_abs_marginal - share of TOTAL raw reads lost at this stage
+  mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0),
+         p_reads_lost = 1 - p_reads_retained,
+         p_reads_retained_abs = n_read_pairs / n_read_pairs[1],
+         p_reads_lost_abs = 1-p_reads_retained_abs,
+         p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0))
+# Display table: min-max (mean) % of total reads lost per sample type & stage.
+# The raw_concat row is dropped (nothing is lost before processing starts);
+# remaining stage codes are relabelled with human-readable names, relying on
+# the factor level order set in `stages`.
+n_reads_rel_display <- n_reads_rel %>% 
+  group_by(`Sample Type`=sample_type_short, Stage=stage) %>% 
+  summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"),
+            `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>% 
+  filter(Stage != "raw_concat") %>%
+  mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion")))
+n_reads_rel_display
+
+
+ +
+
+
+
+
Code
g_stage_base <- ggplot(mapping=aes(x=stage, color=sample_type_short, group=sample)) +
+  scale_color_st() +
+  theme_kit
+
+# Plot reads over preprocessing
+g_reads_stages <- g_stage_base +
+  geom_line(aes(y=n_read_pairs), data=basic_stats) +
+  scale_y_continuous("# Read pairs", expand=c(0,0), limits=c(0,NA))
+g_reads_stages
+
+
+

+
+
+
+
Code
# Plot relative read losses during preprocessing
+g_reads_rel <- g_stage_base +
+  geom_line(aes(y=p_reads_lost_abs_marginal), data=n_reads_rel) +
+  scale_y_continuous("% Total Reads Lost", expand=c(0,0), 
+                     labels = function(x) x*100)
+g_reads_rel
+
+
+

+
+
+
+
+

Data cleaning was very successful at removing adapters and improving read qualities:

+
+
Code
g_qual <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, 
+                         group=interaction(sample,read_pair))) + 
+  scale_color_st() + scale_linetype_discrete(name = "Read Pair") +
+  guides(color=guide_legend(nrow=2,byrow=TRUE),
+         linetype = guide_legend(nrow=2,byrow=TRUE)) +
+  theme_base
+
+# Visualize adapters
+g_adapters <- g_qual + 
+  geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +
+  scale_y_continuous(name="% Adapters", limits=c(0,20),
+                     breaks = seq(0,50,10), expand=c(0,0)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,140,20), expand=c(0,0)) +
+  facet_grid(stage~adapter)
+g_adapters
+
+
+

+
+
+
+
Code
# Visualize quality
+g_quality_base <- g_qual +
+  geom_hline(yintercept=25, linetype="dashed", color="red") +
+  geom_hline(yintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +
+  scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,140,20), expand=c(0,0)) +
+  facet_grid(stage~.)
+g_quality_base
+
+
+

+
+
+
+
Code
g_quality_seq <- g_qual +
+  geom_vline(xintercept=25, linetype="dashed", color="red") +
+  geom_vline(xintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +
+  scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+  scale_y_continuous(name="# Sequences", expand=c(0,0)) +
+  facet_grid(stage~.)
+g_quality_seq
+
+
+

+
+
+
+
+

According to FASTQC, cleaning + deduplication was very effective at reducing measured duplicate levels, which fell from an average of 31% to 6.5%:

+
+
Code
stage_dup <- basic_stats %>% group_by(stage) %>% 
+  summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates),
+            dmean=mean(percent_duplicates), .groups = "drop")
+
+g_dup_stages <- g_stage_base +
+  geom_line(aes(y=percent_duplicates), data=basic_stats) +
+  scale_y_continuous("% Duplicates", limits=c(0,NA), expand=c(0,0))
+g_dup_stages
+
+
+

+
+
+
+
Code
g_readlen_stages <- g_stage_base + 
+  geom_line(aes(y=mean_seq_len), data=basic_stats) +
+  scale_y_continuous("Mean read length (nt)", expand=c(0,0), limits=c(0,NA))
+g_readlen_stages
+
+
+

+
+
+
+
+

High-level composition

+

As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:

+
+
Code
classifications <- c("Filtered", "Duplicate", "Ribosomal", "Unassigned",
+                     "Bacterial", "Archaeal", "Viral", "Human")
+
+# Import composition data
+comp_path <- file.path(data_dir, "taxonomic_composition.tsv.gz")
+comp <- read_tsv(comp_path, show_col_types = FALSE) %>%
+  left_join(libraries, by="sample") %>%
+  mutate(classification = factor(classification, levels = classifications))
+  
+
+# Summarize composition
+read_comp_summ <- comp %>% 
+  group_by(sample_type_short, classification) %>%
+  summarize(n_reads = sum(n_reads), .groups = "drop_last") %>%
+  mutate(n_reads = replace_na(n_reads,0),
+    p_reads = n_reads/sum(n_reads),
+    pc_reads = p_reads*100)
+
+
+
+
Code
# Prepare plotting templates
+g_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) +
+  facet_wrap(~sample_type_short, scales = "free_x", ncol=3,
+             labeller = label_wrap_gen(multi_line=FALSE, width=20)) +
+  theme_xblank
+scale_y_pc_reads <- purrr::partial(scale_y_continuous, name = "% Reads",
+                                   expand = c(0,0), labels = function(y) y*100)
+
+# Plot overall composition
+g_comp <- g_comp_base + geom_col(data = comp, position = "stack", width=1) +
+  scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) +
+  scale_fill_brewer(palette = "Set1", name = "Classification")
+g_comp
+
+
+

+
+
+
+
Code
# Plot composition of minor components (non-bacterial classified reads)
+# NOTE(review): "Other" is not among the levels used to factor
+# comp$classification (the `classifications` vector above), so the "Other"
+# term in this filter can never match — confirm whether an "Other" category
+# was intended to be computed and included here.
+comp_minor <- comp %>% 
+  filter(classification %in% c("Archaeal", "Viral", "Human", "Other"))
+# Use the tail of the Set1 palette so colours stay consistent with the
+# full-composition plot above, where these classes occupy positions 6-9.
+palette_minor <- brewer.pal(9, "Set1")[6:9]
+g_comp_minor <- g_comp_base + 
+  geom_col(data=comp_minor, position = "stack", width=1) +
+  scale_y_pc_reads() +
+  scale_fill_manual(values=palette_minor, name = "Classification")
+g_comp_minor
+
+
+

+
+
+
+
+
+
Code
p_reads_summ_group <- comp %>%
+  mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), "Excluded", as.character(classification)),
+         classification = fct_inorder(classification)) %>%
+  group_by(classification, sample, sample_type_short) %>%
+  summarize(p_reads = sum(p_reads), .groups = "drop") %>%
+  group_by(classification, sample_type_short) %>%
+  summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, 
+            pc_mean = mean(p_reads)*100, .groups = "drop")
+p_reads_summ_prep <- p_reads_summ_group %>%
+  mutate(classification = fct_inorder(classification),
+         pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)"))
+p_reads_summ <- p_reads_summ_prep %>%
+  select(`Sample Type`=sample_type_short, Classification=classification, 
+         `Read Fraction`=display) %>%
+  arrange(`Sample Type`, Classification)
+p_reads_summ
+
+
+ +
+
+
+

As in previous DNA datasets, the vast majority of classified reads were bacterial in origin. The fraction of virus reads varied substantially between sample types, averaging <0.01% in influent and final effluent but closer to 0.05% in other sample types. Interestingly (though not particularly relevantly for this analysis), the fraction of archaeal reads was much higher in influent than other sample types, in contrast to Bengtsson-Palme where it was highest in sludge.

+

As is common for DNA data, viral reads were overwhelmingly dominated by Caudoviricetes phages, though one wet-well sample contained a substantial fraction of Alsuviricetes (a class of mainly plant pathogens that includes Virgaviridae):

+
+
Code
# Get Kraken reports
+reports_path <- file.path(data_dir, "kraken_reports.tsv.gz")
+reports <- read_tsv(reports_path, show_col_types = FALSE)
+
+# Get viral taxonomy
+viral_taxa_path <- file.path(data_dir, "viral-taxids.tsv.gz")
+viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)
+
+# Filter to viral taxa
+kraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>%
+  group_by(sample) %>%
+  mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])
+kraken_reports_viral_cleaned <- kraken_reports_viral %>%
+  inner_join(libraries, by="sample") %>%
+  select(-pc_reads_total, -n_reads_direct, -contains("minimizers")) %>%
+  select(name, taxid, p_reads_viral, n_reads_clade, everything())
+
+viral_classes <- kraken_reports_viral_cleaned %>% filter(rank == "C")
+viral_families <- kraken_reports_viral_cleaned %>% filter(rank == "F")
+
+
+
+
Code
major_threshold <- 0.02
+
+# Identify major viral classes
+viral_classes_major_tab <- viral_classes %>% 
+  group_by(name, taxid) %>%
+  summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>%
+  filter(p_reads_viral_max >= major_threshold)
+viral_classes_major_list <- viral_classes_major_tab %>% pull(name)
+viral_classes_major <- viral_classes %>% 
+  filter(name %in% viral_classes_major_list) %>%
+  select(name, taxid, sample, sample_type_short, date, p_reads_viral)
+viral_classes_minor <- viral_classes_major %>% 
+  group_by(sample, sample_type_short, date) %>%
+  summarize(p_reads_viral_major = sum(p_reads_viral), .groups = "drop") %>%
+  mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%
+  select(name, taxid, sample, sample_type_short, date, p_reads_viral)
+viral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>%
+  arrange(desc(p_reads_viral)) %>% 
+  mutate(name = factor(name, levels=c(viral_classes_major_list, "Other")),
+         p_reads_viral = pmax(p_reads_viral, 0)) %>%
+  rename(p_reads = p_reads_viral, classification=name)
+
+palette_viral <- c(brewer.pal(12, "Set3"), brewer.pal(8, "Dark2"))
+g_classes <- g_comp_base + 
+  geom_col(data=viral_classes_display, position = "stack", width=1) +
+  scale_y_continuous(name="% Viral Reads", limits=c(0,1.01), breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral class")
+  
+g_classes
+
+
+

+
+
+
+
+

Human-infecting virus reads: validation

+

Next, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 527 reads were identified as putatively human-viral, with half of samples showing 5 or fewer total HV read pairs.

+
+
Code
# Import HV read data
+hv_reads_filtered_path <- file.path(data_dir, "hv_hits_putative_filtered.tsv.gz")
+hv_reads_filtered <- lapply(hv_reads_filtered_path, read_tsv,
+                            show_col_types = FALSE) %>%
+  bind_rows() %>%
+  left_join(libraries, by="sample")
+
+# Count reads
+n_hv_filtered <- hv_reads_filtered %>%
+  group_by(sample, date, sample_type_short, seq_id) %>% count %>%
+  group_by(sample, date, sample_type_short) %>% count %>% 
+  inner_join(basic_stats %>% filter(stage == "ribo_initial") %>% 
+               select(sample, n_read_pairs), by="sample") %>% 
+  rename(n_putative = n, n_total = n_read_pairs) %>% 
+  mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)
+n_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%
+  summarize(n_putative = sum(n_putative), n_total = sum(n_total), 
+            .groups="drop") %>% 
+  mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)
+
+
+
+
Code
# Collapse multi-entry sequences: one row per seq_id, merging duplicate hits
+# rmax: max ignoring NAs. collapse: returns the single value if all entries
+# are equal, otherwise joins all entries with "/".
+# NOTE(review): collapse uses all(x == x[1]), which yields NA (not FALSE)
+# if x contains NA — presumably the collapsed columns are never NA here;
+# verify against upstream pipeline output.
+rmax <- purrr::partial(max, na.rm = TRUE)
+collapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse="/"))
+# Rows are sorted by descending adj_score_max BEFORE grouping, so the [1]
+# selections below (taxid_best, assigned_taxid_best, query_seq_*) take the
+# values from the highest-scoring entry for each seq_id.
+mrg <- hv_reads_filtered %>% 
+  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%
+  arrange(desc(adj_score_max)) %>%
+  group_by(seq_id) %>%
+  summarize(sample = collapse(sample),
+            genome_id = collapse(genome_id),
+            taxid_best = taxid[1],
+            taxid = collapse(as.character(taxid)),
+            best_alignment_score_fwd = rmax(best_alignment_score_fwd),
+            best_alignment_score_rev = rmax(best_alignment_score_rev),
+            query_len_fwd = rmax(query_len_fwd),
+            query_len_rev = rmax(query_len_rev),
+            query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
+            query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
+            classified = rmax(classified),
+            assigned_name = collapse(assigned_name),
+            assigned_taxid_best = assigned_taxid[1],
+            assigned_taxid = collapse(as.character(assigned_taxid)),
+            assigned_hv = rmax(assigned_hv),
+            hit_hv = rmax(hit_hv),
+            encoded_hits = collapse(encoded_hits),
+            adj_score_fwd = rmax(adj_score_fwd),
+            adj_score_rev = rmax(adj_score_rev)
+            ) %>%
+  inner_join(libraries, by="sample") %>%
+  # kraken_label partitions reads by Kraken2 evidence level for faceting
+  mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
+                               ifelse(hit_hv, "Kraken2 HV\nhit",
+                                      "No hit or\nassignment"))) %>%
+  # Recompute adj_score_max on the collapsed per-pair scores; highscore marks
+  # reads above the disjunctive score threshold of 20 used throughout
+  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),
+         highscore = adj_score_max >= 20)
+
+# Plot results: histogram of max adjusted alignment score per read pair,
+# faceted by Kraken2 evidence level, with the threshold marked in red
+geom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)
+g_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +
+  geom_vline(xintercept=20, linetype="dashed", color="red") +
+  facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = "free_y") +
+  scale_x_continuous(name = "Maximum adjusted alignment score") + 
+  scale_y_continuous(name="# Read pairs") + 
+  theme_base 
+g_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)
+g_vhist_0
+
+
+

+
+
+
+
+

BLASTing these reads against nt, we find that the pipeline performs well, with only a single high-scoring false-positive read:

+
+
Code
# Import paired BLAST results (BLASTn of putative HV reads against nt)
+blast_paired_path <- file.path(data_dir, "hv_hits_blast_paired.tsv.gz")
+blast_paired <- read_tsv(blast_paired_path, show_col_types = FALSE)
+
+# Add viral status: viral = BLAST subject taxid is a known viral taxid;
+# viral_full = both reads of the pair hit that viral taxid (n_reads == 2)
+blast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%
+  mutate(viral_full = viral & n_reads == 2)
+
+# Compare to Kraken & Bowtie assignments.
+# match_taxid: TRUE where taxid_1 appears in the "/"-collapsed taxid string
+# taxid_2 (as produced by collapse() above) — matched at the start, end,
+# or as the whole string.
+match_taxid <- function(taxid_1, taxid_2){
+  p1 <- mapply(grepl, paste0("/", taxid_1, "$"), taxid_2)
+  p2 <- mapply(grepl, paste0("^", taxid_1, "/"), taxid_2)
+  p3 <- mapply(grepl, paste0("^", taxid_1, "$"), taxid_2)
+  out <- setNames(p1|p2|p3, NULL)
+  return(out)
+}
+mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)
+blast_assign <- inner_join(blast_viral, mrg_assign, by="seq_id") %>%
+    mutate(taxid_match_bowtie = match_taxid(staxid, taxid),
+           taxid_match_kraken = match_taxid(staxid, assigned_taxid),
+           taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
+# viral_status per seq_id: 2 = both mates viral OR BLAST taxid matches the
+# Bowtie/Kraken assignment; 1 = some viral hit without full support;
+# 0 = no viral hit
+blast_out <- blast_assign %>%
+  group_by(seq_id) %>%
+  summarize(viral_status = ifelse(any(viral_full), 2,
+                                  ifelse(any(taxid_match_any), 2,
+                                             ifelse(any(viral), 1, 0))),
+            .groups = "drop")
+
+
+
+
Code
# Merge BLAST results with unenriched read data
+# Reads absent from blast_out get viral_status 0 (treated as non-viral);
+# viral_status_out collapses the 0/1/2 coding to a TRUE/FALSE truth label.
+mrg_blast <- full_join(mrg, blast_out, by="seq_id") %>%
+  mutate(viral_status = replace_na(viral_status, 0),
+         viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))
+
+# Plot
+g_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +
+  scale_fill_brewer(palette = "Set1", name = "Viral status")
+g_vhist_1
+
+
+

+
+
+
+
+

My usual disjunctive score threshold of 20 gave precision, sensitivity, and F1 scores all >97%:

+
+
Code
# Compute sensitivity/specificity/precision/F1 at one score threshold.
# Retention is disjunctive: Kraken HV assignment OR either mate's adjusted
# score exceeding the threshold. Truth labels come from viral_status_out.
test_sens_spec <- function(tab, score_threshold){
+  tab_retained <- tab %>% 
+    mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),
+           retain = assigned_hv | retain_score) %>%
+    group_by(viral_status_out, retain) %>% count
+  # Confusion-matrix cells (summed over any residual grouping)
+  pos_tru <- tab_retained %>% filter(viral_status_out == "TRUE", retain) %>% pull(n) %>% sum
+  pos_fls <- tab_retained %>% filter(viral_status_out != "TRUE", retain) %>% pull(n) %>% sum
+  neg_tru <- tab_retained %>% filter(viral_status_out != "TRUE", !retain) %>% pull(n) %>% sum
+  neg_fls <- tab_retained %>% filter(viral_status_out == "TRUE", !retain) %>% pull(n) %>% sum
+  sensitivity <- pos_tru / (pos_tru + neg_fls)
+  specificity <- neg_tru / (neg_tru + pos_fls)
+  precision   <- pos_tru / (pos_tru + pos_fls)
+  f1 <- 2 * precision * sensitivity / (precision + sensitivity)
+  out <- tibble(threshold=score_threshold, sensitivity=sensitivity, 
+                specificity=specificity, precision=precision, f1=f1)
+  return(out)
+}
+# Sweep thresholds over inrange and return metrics in long format for plotting.
+range_f1 <- function(intab, inrange=15:45){
+  tss <- purrr::partial(test_sens_spec, tab=intab)
+  stats <- lapply(inrange, tss) %>% bind_rows %>%
+    pivot_longer(!threshold, names_to="metric", values_to="value")
+  return(stats)
+}
+stats_0 <- range_f1(mrg_blast)
+g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
+  geom_vline(xintercept=20, color = "red", linetype = "dashed") +
+  geom_line() +
+  scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
+  scale_x_continuous(name = "Adjusted Score Threshold", expand = c(0,0)) +
+  scale_color_brewer(palette="Dark2") +
+  theme_base
+g_stats_0
+
+
+

+
+
+
+
Code
# Display the metrics at the working threshold (S = 20) as a table.
stats_0 %>% filter(threshold == 20) %>% 
+  select(Threshold=threshold, Metric=metric, Value=value)
+
+
+ +
+
+
+

Human-infecting viruses: overall relative abundance

+
+
Code
# Get raw read counts
+read_counts_raw <- basic_stats_raw %>%
+  select(sample, sample_type_short, date, n_reads_raw = n_read_pairs)
+
+# Get HV read counts
+# hv_status: pair counted as human-viral if Kraken assigned it HV or its
+# maximum adjusted score cleared the threshold (highscore, set upstream).
+mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | highscore) %>%
+  rename(taxid_all = taxid, taxid = taxid_best)
+read_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% 
+  count(name="n_reads_hv")
+read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>%
+  mutate(n_reads_hv = replace_na(n_reads_hv, 0))
+
+# Aggregate
+# Build marginal totals over samples, sample types, and dates; each
+# collapsed dimension is relabelled "All ..." so all levels share one
+# factor axis in the final plot.
+read_counts_grp <- read_counts %>% group_by(date, sample_type_short) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
+  mutate(sample= "All samples")
+read_counts_st <- read_counts_grp %>% group_by(sample, sample_type_short) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
+  mutate(date = "All dates")
+read_counts_date <- read_counts_grp %>%
+  group_by(sample, date) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
+  mutate(sample_type_short = "All sample types")
+read_counts_tot <- read_counts_date %>% group_by(sample, sample_type_short) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
+  mutate(date = "All dates")
+read_counts_agg <- bind_rows(read_counts_grp, read_counts_st,
+                             read_counts_date, read_counts_tot) %>%
+  mutate(p_reads_hv = n_reads_hv/n_reads_raw,
+         date = factor(date, levels = c(levels(libraries$date), "All dates")),
+         sample_type_short = factor(sample_type_short, levels = c(levels(libraries$sample_type_short), "All sample types")))
+
+
+

Applying a disjunctive cutoff at S=20 identifies 482 read pairs as human-viral. This gives an overall relative HV abundance of \(2.90 \times 10^{-7}\); on the low end across all datasets I’ve analyzed, though higher than for Bengtsson-Palme:

+
+
Code
# Visualize
+# Relative HV abundance by date, coloured by sample type (log10 y-axis).
+g_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=sample_type_short)) +
+  geom_point(aes(y=p_reads_hv)) +
+  scale_y_log10("Relative abundance of human virus reads") +
+  scale_color_st() + theme_kit
+g_phv_agg
+
+
+

+
+
+
+
+
+
Code
# Collate past RA values
+# Hand-collated overall HV relative abundances from previously analyzed
+# datasets, keyed by nucleic-acid type and panel-enrichment status.
+ra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
+                   "Brumfield", 5e-5, "RNA", FALSE,
+                   "Brumfield", 3.66e-7, "DNA", FALSE,
+                   "Spurbeck", 5.44e-6, "RNA", FALSE,
+                   "Yang", 3.62e-4, "RNA", FALSE,
+                   "Rothman (unenriched)", 1.87e-5, "RNA", FALSE,
+                   "Rothman (panel-enriched)", 3.3e-5, "RNA", TRUE,
+                   "Crits-Christoph (unenriched)", 1.37e-5, "RNA", FALSE,
+                   "Crits-Christoph (panel-enriched)", 1.26e-2, "RNA", TRUE,
+                   "Prussin (non-control)", 1.63e-5, "RNA", FALSE,
+                   "Prussin (non-control)", 4.16e-5, "DNA", FALSE,
+                   "Rosario (non-control)", 1.21e-5, "RNA", FALSE,
+                   "Rosario (non-control)", 1.50e-4, "DNA", FALSE,
+                   "Leung", 1.73e-5, "DNA", FALSE,
+                   "Brinch", 3.88e-6, "DNA", FALSE,
+                   "Bengtsson-Palme", 8.86e-8, "DNA", FALSE
+)
+
+# Collate new RA values
+ra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
+                  "Ng", 2.90e-7, "DNA", FALSE)
+
+
+# Plot
+scale_color_na <- purrr::partial(scale_color_brewer, palette="Set1",
+                                 name="Nucleic acid type")
+# fct_inorder keeps datasets in collation order on the y-axis.
+ra_comp <- bind_rows(ra_past, ra_new) %>% mutate(dataset = fct_inorder(dataset))
+g_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +
+  geom_point() +
+  scale_color_na() +
+  scale_x_log10(name="Relative abundance of human virus reads") +
+  theme_base + theme(axis.title.y = element_blank())
+g_ra_comp
+
+
+

+
+
+
+
+

Human-infecting viruses: taxonomy and composition

+

In investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 13 samples met this criterion.

+

At the family level, most samples were overwhelmingly dominated by Adenoviridae, with Picornaviridae, Polyomaviridae and Papillomaviridae making up most of the rest:

+
+
Code
# Get viral taxon names for putative HV reads
+# Manual patches to viral_taxa: fill in missing or uninformative names for
+# taxids observed among HV reads so downstream plots are labelled usefully.
+viral_taxa$name[viral_taxa$taxid == 249588] <- "Mamastrovirus"
+viral_taxa$name[viral_taxa$taxid == 194960] <- "Kobuvirus"
+viral_taxa$name[viral_taxa$taxid == 688449] <- "Salivirus"
+viral_taxa$name[viral_taxa$taxid == 585893] <- "Picobirnaviridae"
+viral_taxa$name[viral_taxa$taxid == 333922] <- "Betapapillomavirus"
+viral_taxa$name[viral_taxa$taxid == 334207] <- "Betapapillomavirus 3"
+viral_taxa$name[viral_taxa$taxid == 369960] <- "Porcine type-C oncovirus"
+viral_taxa$name[viral_taxa$taxid == 333924] <- "Betapapillomavirus 2"
+viral_taxa$name[viral_taxa$taxid == 687329] <- "Anelloviridae"
+viral_taxa$name[viral_taxa$taxid == 325455] <- "Gammapapillomavirus"
+viral_taxa$name[viral_taxa$taxid == 333750] <- "Alphapapillomavirus"
+viral_taxa$name[viral_taxa$taxid == 694002] <- "Betacoronavirus"
+viral_taxa$name[viral_taxa$taxid == 334202] <- "Mupapillomavirus"
+viral_taxa$name[viral_taxa$taxid == 197911] <- "Alphainfluenzavirus"
+viral_taxa$name[viral_taxa$taxid == 186938] <- "Respirovirus"
+viral_taxa$name[viral_taxa$taxid == 333926] <- "Gammapapillomavirus 1"
+viral_taxa$name[viral_taxa$taxid == 337051] <- "Betapapillomavirus 1"
+viral_taxa$name[viral_taxa$taxid == 337043] <- "Alphapapillomavirus 4"
+viral_taxa$name[viral_taxa$taxid == 694003] <- "Betacoronavirus 1"
+viral_taxa$name[viral_taxa$taxid == 334204] <- "Mupapillomavirus 2"
+viral_taxa$name[viral_taxa$taxid == 334208] <- "Betapapillomavirus 4"
+viral_taxa$name[viral_taxa$taxid == 333928] <- "Gammapapillomavirus 2"
+viral_taxa$name[viral_taxa$taxid == 337039] <- "Alphapapillomavirus 2"
+viral_taxa$name[viral_taxa$taxid == 333929] <- "Gammapapillomavirus 3"
+viral_taxa$name[viral_taxa$taxid == 337042] <- "Alphapapillomavirus 7"
+viral_taxa$name[viral_taxa$taxid == 334203] <- "Mupapillomavirus 1"
+viral_taxa$name[viral_taxa$taxid == 333757] <- "Alphapapillomavirus 8"
+viral_taxa$name[viral_taxa$taxid == 337050] <- "Alphapapillomavirus 6"
+viral_taxa$name[viral_taxa$taxid == 333767] <- "Alphapapillomavirus 3"
+viral_taxa$name[viral_taxa$taxid == 333754] <- "Alphapapillomavirus 10"
+viral_taxa$name[viral_taxa$taxid == 687363] <- "Torque teno virus 24"
+viral_taxa$name[viral_taxa$taxid == 687342] <- "Torque teno virus 3"
+viral_taxa$name[viral_taxa$taxid == 687359] <- "Torque teno virus 20"
+viral_taxa$name[viral_taxa$taxid == 194441] <- "Primate T-lymphotropic virus 2"
+viral_taxa$name[viral_taxa$taxid == 334209] <- "Betapapillomavirus 5"
+viral_taxa$name[viral_taxa$taxid == 194965] <- "Aichivirus B"
+viral_taxa$name[viral_taxa$taxid == 333930] <- "Gammapapillomavirus 4"
+viral_taxa$name[viral_taxa$taxid == 337048] <- "Alphapapillomavirus 1"
+viral_taxa$name[viral_taxa$taxid == 337041] <- "Alphapapillomavirus 9"
+viral_taxa$name[viral_taxa$taxid == 337049] <- "Alphapapillomavirus 11"
+viral_taxa$name[viral_taxa$taxid == 337044] <- "Alphapapillomavirus 5"
+
+# Filter samples and add viral taxa information
+samples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% pull(sample)
+mrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep, hv_status) %>% left_join(viral_taxa, by="taxid") 
+
+# Discover viral species & genera for HV reads
+# Walk each read's taxid up the taxonomy (via parent_taxid) until it reaches
+# out_rank; reads already at or above out_rank, or that hit the root, are
+# appended unchanged at the end.
+raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
+  # Get higher ranks than search rank
+  ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
+  rank_match <- which.max(ranks == out_rank)
+  high_ranks <- ranks[rank_match:length(ranks)]
+  # Merge read DB and taxid DB
+  reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%
+    left_join(taxid_db, by="taxid")
+  # Extract sequences that are already at appropriate rank
+  reads_rank <- filter(reads, rank == out_rank)
+  # Drop sequences at a higher rank and return unclassified sequences
+  reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
+  while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
+    # Promote read taxids and re-merge with taxid DB, then re-classify and filter
+    reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
+      select(-parent_taxid, -rank, -name) %>%
+      left_join(taxid_db, by="taxid")
+    reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
+      bind_rows(reads_rank)
+    reads_norank <- reads_remaining %>%
+      filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
+  }
+  # Finally, extract and append reads that were excluded during the process
+  reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
+  reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%
+    select(-parent_taxid, -rank, -name) %>%
+    left_join(taxid_db, by="taxid")
+  return(reads_out)
+}
+hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
+hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
+hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
+
+
+
+
Code
# Minimum per-sample fraction for a family to get its own legend entry.
threshold_major_family <- 0.02
+
+# Count reads for each human-viral family
+hv_family_counts <- hv_reads_family %>% 
+  group_by(sample, date, sample_type_short, name, taxid) %>%
+  count(name = "n_reads_hv") %>%
+  group_by(sample, date, sample_type_short) %>%
+  mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking families and group others
+# A family is "major" if its peak per-sample fraction exceeds the threshold;
+# all other families are lumped into "Other".
+hv_family_major_tab <- hv_family_counts %>% group_by(name) %>% 
+  filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
+  arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > threshold_major_family)
+hv_family_counts_major <- hv_family_counts %>%
+  mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")) %>%
+  group_by(sample, date, sample_type_short, name_display) %>%
+  summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), 
+            .groups="drop") %>%
+  mutate(name_display = factor(name_display, 
+                               levels = c(hv_family_major_tab$name, "Other")))
+hv_family_counts_display <- hv_family_counts_major %>%
+  rename(p_reads = p_reads_hv, classification = name_display)
+
+# Plot
+g_hv_family <- g_comp_base + 
+  geom_col(data=hv_family_counts_display, position = "stack") +
+  scale_y_continuous(name="% HV Reads", limits=c(0,1.01), 
+                     breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral family") +
+  labs(title="Family composition of human-viral reads") +
+  guides(fill=guide_legend(ncol=4)) +
+  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))
+g_hv_family
+
+
+

+
+
+
+
Code
# Get most prominent families for text
+# Summary table used in prose: total reads and peak per-sample fraction.
+hv_family_collate <- hv_family_counts %>% group_by(name, taxid) %>% 
+  summarize(n_reads_tot = sum(n_reads_hv),
+            p_reads_max = max(p_reads_hv), .groups="drop") %>% 
+  arrange(desc(n_reads_tot))
+
+
+

In investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that family made up at least 10% of human-viral reads:

+
+
Code
# Species-level composition of Adenoviridae, restricted to samples where
# the family makes up >= 10% of HV reads.
threshold_major_species <- 0.05
+taxid_adeno <- 10508
+
+# Get set of adenoviridae reads
+adeno_samples <- hv_family_counts %>% filter(taxid == taxid_adeno) %>%
+  filter(p_reads_hv >= 0.1) %>%
+  pull(sample)
+adeno_ids <- hv_reads_family %>% 
+  filter(taxid == taxid_adeno, sample %in% adeno_samples) %>%
+  pull(seq_id)
+
+# Count reads for each adenoviridae species
+adeno_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% adeno_ids) %>%
+  group_by(sample, date, sample_type_short, name, taxid) %>%
+  count(name = "n_reads_hv") %>%
+  group_by(sample, date, sample_type_short) %>%
+  mutate(p_reads_adeno = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking families and group others
+# (same major/Other lumping scheme as the family-level plot)
+adeno_species_major_tab <- adeno_species_counts %>% group_by(name) %>% 
+  filter(p_reads_adeno == max(p_reads_adeno)) %>% 
+  filter(row_number() == 1) %>%
+  arrange(desc(p_reads_adeno)) %>% 
+  filter(p_reads_adeno > threshold_major_species)
+adeno_species_counts_major <- adeno_species_counts %>%
+  mutate(name_display = ifelse(name %in% adeno_species_major_tab$name, 
+                               name, "Other")) %>%
+  group_by(sample, date, sample_type_short, name_display) %>%
+  summarize(n_reads_adeno = sum(n_reads_hv),
+            p_reads_adeno = sum(p_reads_adeno), 
+            .groups="drop") %>%
+  mutate(name_display = factor(name_display, 
+                               levels = c(adeno_species_major_tab$name, "Other")))
+adeno_species_counts_display <- adeno_species_counts_major %>%
+  rename(p_reads = p_reads_adeno, classification = name_display)
+
+# Plot
+g_adeno_species <- g_comp_base + 
+  geom_col(data=adeno_species_counts_display, position = "stack") +
+  scale_y_continuous(name="% Adenoviridae Reads", limits=c(0,1.01), 
+                     breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral species") +
+  labs(title="Species composition of Adenoviridae reads") +
+  guides(fill=guide_legend(ncol=3)) +
+  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))
+
+g_adeno_species
+
+
+

+
+
+
+
Code
# Get most prominent species for text
+# Summary table used in prose: total reads and mean per-sample fraction.
+adeno_species_collate <- adeno_species_counts %>% group_by(name, taxid) %>% 
+  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_adeno), .groups="drop") %>% 
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
# Species-level composition of Picornaviridae (same scheme as Adenoviridae,
# with a higher major-species threshold). Note: reuses and overwrites
# threshold_major_species from the previous chunk.
threshold_major_species <- 0.1
+taxid_picorna <- 12058
+
+# Get set of picornaviridae reads
+picorna_samples <- hv_family_counts %>% filter(taxid == taxid_picorna) %>%
+  filter(p_reads_hv >= 0.1) %>%
+  pull(sample)
+picorna_ids <- hv_reads_family %>% 
+  filter(taxid == taxid_picorna, sample %in% picorna_samples) %>%
+  pull(seq_id)
+
+# Count reads for each picornaviridae species
+picorna_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% picorna_ids) %>%
+  group_by(sample, date, sample_type_short, name, taxid) %>%
+  count(name = "n_reads_hv") %>%
+  group_by(sample, date, sample_type_short) %>%
+  mutate(p_reads_picorna = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking families and group others
+picorna_species_major_tab <- picorna_species_counts %>% group_by(name) %>% 
+  filter(p_reads_picorna == max(p_reads_picorna)) %>% 
+  filter(row_number() == 1) %>%
+  arrange(desc(p_reads_picorna)) %>% 
+  filter(p_reads_picorna > threshold_major_species)
+picorna_species_counts_major <- picorna_species_counts %>%
+  mutate(name_display = ifelse(name %in% picorna_species_major_tab$name, 
+                               name, "Other")) %>%
+  group_by(sample, date, sample_type_short, name_display) %>%
+  summarize(n_reads_picorna = sum(n_reads_hv),
+            p_reads_picorna = sum(p_reads_picorna), 
+            .groups="drop") %>%
+  mutate(name_display = factor(name_display, 
+                               levels = c(picorna_species_major_tab$name, "Other")))
+picorna_species_counts_display <- picorna_species_counts_major %>%
+  rename(p_reads = p_reads_picorna, classification = name_display)
+
+# Plot
+g_picorna_species <- g_comp_base + 
+  geom_col(data=picorna_species_counts_display, position = "stack") +
+  scale_y_continuous(name="% Picornaviridae Reads", limits=c(0,1.01), 
+                     breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral species") +
+  labs(title="Species composition of Picornaviridae reads") +
+  guides(fill=guide_legend(ncol=3)) +
+  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))
+
+g_picorna_species
+
+
+

+
+
+
+
Code
# Get most prominent species for text
+# Summary table used in prose: total reads and mean per-sample fraction.
+picorna_species_collate <- picorna_species_counts %>% group_by(name, taxid) %>% 
+  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_picorna), .groups="drop") %>% 
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
# Species-level composition of Polyomaviridae (same scheme as the other
# family-specific chunks above).
threshold_major_species <- 0.1
+taxid_polyoma <- 151341
+
+# Get set of polyomaviridae reads
+polyoma_samples <- hv_family_counts %>% filter(taxid == taxid_polyoma) %>%
+  filter(p_reads_hv >= 0.1) %>%
+  pull(sample)
+polyoma_ids <- hv_reads_family %>% 
+  filter(taxid == taxid_polyoma, sample %in% polyoma_samples) %>%
+  pull(seq_id)
+
+# Count reads for each polyomaviridae species
+polyoma_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% polyoma_ids) %>%
+  group_by(sample, date, sample_type_short, name, taxid) %>%
+  count(name = "n_reads_hv") %>%
+  group_by(sample, date, sample_type_short) %>%
+  mutate(p_reads_polyoma = n_reads_hv/sum(n_reads_hv))
+
+# Identify high-ranking families and group others
+polyoma_species_major_tab <- polyoma_species_counts %>% group_by(name) %>% 
+  filter(p_reads_polyoma == max(p_reads_polyoma)) %>% 
+  filter(row_number() == 1) %>%
+  arrange(desc(p_reads_polyoma)) %>% 
+  filter(p_reads_polyoma > threshold_major_species)
+polyoma_species_counts_major <- polyoma_species_counts %>%
+  mutate(name_display = ifelse(name %in% polyoma_species_major_tab$name, 
+                               name, "Other")) %>%
+  group_by(sample, date, sample_type_short, name_display) %>%
+  summarize(n_reads_polyoma = sum(n_reads_hv),
+            p_reads_polyoma = sum(p_reads_polyoma), 
+            .groups="drop") %>%
+  mutate(name_display = factor(name_display, 
+                               levels = c(polyoma_species_major_tab$name, "Other")))
+polyoma_species_counts_display <- polyoma_species_counts_major %>%
+  rename(p_reads = p_reads_polyoma, classification = name_display)
+
+# Plot
+g_polyoma_species <- g_comp_base + 
+  geom_col(data=polyoma_species_counts_display, position = "stack") +
+  scale_y_continuous(name="% Polyomaviridae Reads", limits=c(0,1.01), 
+                     breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral species") +
+  labs(title="Species composition of Polyomaviridae reads") +
+  guides(fill=guide_legend(ncol=3)) +
+  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))
+
+g_polyoma_species
+
+
+

+
+
+
+
Code
# Get most prominent species for text
+# Summary table used in prose: total reads and mean per-sample fraction.
+polyoma_species_collate <- polyoma_species_counts %>% group_by(name, taxid) %>% 
+  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_polyoma), .groups="drop") %>% 
+  arrange(desc(n_reads_tot))
+
+
+

Finally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:

+
+
Code
# Define reference genera
+# Manually curated pathogen genera of interest, split by genome type.
+path_genera_rna <- c("Mamastrovirus", "Enterovirus", "Salivirus", "Kobuvirus", "Norovirus", "Sapovirus", "Rotavirus", "Alphacoronavirus", "Betacoronavirus", "Alphainfluenzavirus", "Betainfluenzavirus", "Lentivirus")
+path_genera_dna <- c("Mastadenovirus", "Alphapolyomavirus", "Betapolyomavirus", "Alphapapillomavirus", "Betapapillomavirus", "Gammapapillomavirus", "Orthopoxvirus", "Simplexvirus",
+                     "Lymphocryptovirus", "Cytomegalovirus", "Dependoparvovirus")
+path_genera <- bind_rows(tibble(name=path_genera_rna, genome_type="RNA genome"),
+                         tibble(name=path_genera_dna, genome_type="DNA genome")) %>%
+  left_join(viral_taxa, by="name")
+
+# Count in each sample
+# Uses all samples (no 5-read filter), unlike the composition plots above.
+mrg_hv_named_all <- mrg_hv %>% left_join(viral_taxa, by="taxid")
+hv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, "genus")
+n_path_genera <- hv_reads_genus_all %>% 
+  group_by(sample, date, sample_type_short, name, taxid) %>% 
+  count(name="n_reads_viral") %>% 
+  inner_join(path_genera, by=c("name", "taxid")) %>%
+  left_join(read_counts_raw, by=c("sample", "date", "sample_type_short")) %>%
+  mutate(p_reads_viral = n_reads_viral/n_reads_raw)
+
+# Pivot out and back to add zero lines
+n_path_genera_out <- n_path_genera %>% ungroup %>% select(sample, name, n_reads_viral) %>%
+  pivot_wider(names_from="name", values_from="n_reads_viral", values_fill=0) %>%
+  pivot_longer(-sample, names_to="name", values_to="n_reads_viral") %>%
+  left_join(read_counts_raw, by="sample") %>%
+  left_join(path_genera, by="name") %>%
+  mutate(p_reads_viral = n_reads_viral/n_reads_raw)
+
+## Aggregate across dates
+# NOTE(review): "location" is set here but not used downstream in this
+# chunk — presumably a holdover from another dataset's template; confirm.
+n_path_genera_stype <- n_path_genera_out %>% 
+  group_by(name, taxid, genome_type, sample_type_short) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_viral = sum(n_reads_viral), .groups = "drop") %>%
+  mutate(sample="All samples", location="All locations",
+         p_reads_viral = n_reads_viral/n_reads_raw,
+         na_type = "DNA")
+
+# Plot
+g_path_genera <- ggplot(n_path_genera_stype,
+                        aes(y=name, x=p_reads_viral, color=sample_type_short)) +
+  geom_point() +
+  scale_x_log10(name="Relative abundance") +
+  scale_color_st() +
+  facet_grid(genome_type~., scales="free_y") +
+  theme_base + theme(axis.title.y = element_blank())
+g_path_genera
+
+
+

+
+
+
+
+

Conclusion

+

This is another dataset with very low HV abundance, arising from lab methods intended to maximize bacterial abundance at the expense of other taxa. Nevertheless, this dataset had higher HV relative abundance than the last one. Interestingly, all three wastewater DNA datasets analyzed so far have exhibited a strong predominance of adenoviruses, and especially human mastadenovirus F, among human-infecting viruses. We’ll see if this pattern persists in the other DNA wastewater datasets I have in the queue.

+ + + + +
+
+ + + + \ No newline at end of file diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/hv-family-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-family-1.png new file mode 100644 index 0000000..9edba98 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-family-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-adeno-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-adeno-1.png new file mode 100644 index 0000000..d6f7ed6 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-adeno-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-picorna-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-picorna-1.png new file mode 100644 index 0000000..2aff154 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-picorna-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-polyoma-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-polyoma-1.png new file mode 100644 index 0000000..c7204ca Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/hv-species-polyoma-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-basic-stats-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-basic-stats-1.png new file mode 100644 index 0000000..fa66a22 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-basic-stats-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-blast-results-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-blast-results-1.png new file mode 100644 index 0000000..f22a271 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-blast-results-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-1.png 
b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-1.png new file mode 100644 index 0000000..fabca08 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-2.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-2.png new file mode 100644 index 0000000..3cbeadb Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-composition-all-2.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-f1-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-f1-1.png new file mode 100644 index 0000000..d4c02bc Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-f1-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-ra-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-ra-1.png new file mode 100644 index 0000000..0335aaa Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-ra-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-scores-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-scores-1.png new file mode 100644 index 0000000..e71b0c1 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-hv-scores-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-1.png new file mode 100644 index 0000000..872a80e Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-2.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-2.png new file mode 100644 index 0000000..7b3ea44 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-2.png differ 
diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-3.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-3.png new file mode 100644 index 0000000..696c6d8 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-quality-3.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-1.png new file mode 100644 index 0000000..17a7d24 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-2.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-2.png new file mode 100644 index 0000000..9b33413 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-2.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-3.png b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-3.png new file mode 100644 index 0000000..159afce Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/plot-raw-quality-3.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-1.png new file mode 100644 index 0000000..77f7477 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-2.png b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-2.png new file mode 100644 index 0000000..6e92de5 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-dedup-2.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-1.png new file mode 100644 index 0000000..c48bbac 
Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-2.png b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-2.png new file mode 100644 index 0000000..ec9324b Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/preproc-figures-2.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/ra-genera-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/ra-genera-1.png new file mode 100644 index 0000000..f8b1815 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/ra-genera-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/ra-hv-past-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/ra-hv-past-1.png new file mode 100644 index 0000000..178dc2d Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/ra-hv-past-1.png differ diff --git a/docs/notebooks/2024-05-01_ng_files/figure-html/viral-class-composition-1.png b/docs/notebooks/2024-05-01_ng_files/figure-html/viral-class-composition-1.png new file mode 100644 index 0000000..d25a557 Binary files /dev/null and b/docs/notebooks/2024-05-01_ng_files/figure-html/viral-class-composition-1.png differ diff --git a/docs/notebooks/images/clipboard-3360113909.png b/docs/notebooks/images/clipboard-3360113909.png new file mode 100644 index 0000000..eecfad3 Binary files /dev/null and b/docs/notebooks/images/clipboard-3360113909.png differ diff --git a/docs/search.json b/docs/search.json index 98a0c8f..04cb3d0 100644 --- a/docs/search.json +++ b/docs/search.json @@ -32,7 +32,7 @@ "href": "index.html", "title": "Will's Public NAO Notebook", "section": "", - "text": "Workflow analysis of Bengtsson-Palme et al. (2021)\n\n\nWastewater grab samples from Sweden.\n\n\n\n\n\nMay 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Brinch et al. 
(2020)\n\n\nWastewater from Copenhagen.\n\n\n\n\n\nApr 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Leung et al. (2021)\n\n\nAir sampling from urban public transit systems.\n\n\n\n\n\nApr 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rosario et al. (2018)\n\n\nAir sampling from a student dorm in Colorado.\n\n\n\n\n\nApr 12, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Prussin et al. (2019)\n\n\nAir filters from a daycare in Virginia.\n\n\n\n\n\nApr 12, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Brumfield et al. (2022)\n\n\nWastewater from a manhole in Maryland.\n\n\n\n\n\nApr 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Spurbeck et al. (2023)\n\n\nCave carpa.\n\n\n\n\n\nApr 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nFollowup analysis of Yang et al. (2020)\n\n\nDigging into deduplication.\n\n\n\n\n\nMar 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Yang et al. (2020)\n\n\nWastewater from Xinjiang.\n\n\n\n\n\nMar 16, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nImproving read deduplication in the MGS workflow\n\n\nRemoving reverse-complement duplicates of human-viral reads.\n\n\n\n\n\nMar 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 2\n\n\nPanel-enriched samples.\n\n\n\n\n\nFeb 29, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 1\n\n\nUnenriched samples.\n\n\n\n\n\nFeb 27, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 3\n\n\nFixing the virus pipeline.\n\n\n\n\n\nFeb 15, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 2\n\n\nAbundance and composition of human-infecting viruses.\n\n\n\n\n\nFeb 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. 
(2021), part 1\n\n\nPreprocessing and composition.\n\n\n\n\n\nFeb 4, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nAutomating BLAST validation of human viral read assignment\n\n\nExperiments with BLASTN remote mode\n\n\n\n\n\nJan 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nProject Runway RNA-seq testing data: removing livestock reads\n\n\n\n\n\n\n\n\nDec 22, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Project Runway RNA-seq testing data\n\n\nApplying a new workflow to some oldish data.\n\n\n\n\n\nDec 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nEstimating the effect of read depth on duplication rate for Project Runway DNA data\n\n\nHow deep can we go?\n\n\n\n\n\nNov 8, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items" + "text": "Workflow analysis of Ng et al. (2019)\n\n\nWastewater from Singapore.\n\n\n\n\n\nMay 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Bengtsson-Palme et al. (2016)\n\n\nWastewater grab samples from Sweden.\n\n\n\n\n\nMay 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Brinch et al. 
(2020)\n\n\nWastewater from Copenhagen.\n\n\n\n\n\nApr 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Leung et al. (2021)\n\n\nAir sampling from urban public transit systems.\n\n\n\n\n\nApr 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rosario et al. (2018)\n\n\nAir sampling from a student dorm in Colorado.\n\n\n\n\n\nApr 12, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Prussin et al. (2019)\n\n\nAir filters from a daycare in Virginia.\n\n\n\n\n\nApr 12, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Brumfield et al. (2022)\n\n\nWastewater from a manhole in Maryland.\n\n\n\n\n\nApr 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Spurbeck et al. (2023)\n\n\nCave carpa.\n\n\n\n\n\nApr 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nFollowup analysis of Yang et al. (2020)\n\n\nDigging into deduplication.\n\n\n\n\n\nMar 19, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Yang et al. (2020)\n\n\nWastewater from Xinjiang.\n\n\n\n\n\nMar 16, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nImproving read deduplication in the MGS workflow\n\n\nRemoving reverse-complement duplicates of human-viral reads.\n\n\n\n\n\nMar 1, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 2\n\n\nPanel-enriched samples.\n\n\n\n\n\nFeb 29, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Rothman et al. (2021), part 1\n\n\nUnenriched samples.\n\n\n\n\n\nFeb 27, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 3\n\n\nFixing the virus pipeline.\n\n\n\n\n\nFeb 15, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. (2021), part 2\n\n\nAbundance and composition of human-infecting viruses.\n\n\n\n\n\nFeb 8, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Crits-Christoph et al. 
(2021), part 1\n\n\nPreprocessing and composition.\n\n\n\n\n\nFeb 4, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nAutomating BLAST validation of human viral read assignment\n\n\nExperiments with BLASTN remote mode\n\n\n\n\n\nJan 30, 2024\n\n\n\n\n\n\n\n\n\n\n\n\nProject Runway RNA-seq testing data: removing livestock reads\n\n\n\n\n\n\n\n\nDec 22, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nWorkflow analysis of Project Runway RNA-seq testing data\n\n\nApplying a new workflow to some oldish data.\n\n\n\n\n\nDec 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nEstimating the effect of read depth on duplication rate for Project Runway DNA data\n\n\nHow deep can we go?\n\n\n\n\n\nNov 8, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n\n\n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items" }, { "objectID": "notebooks/2023-10-12_fastp-vs-adapterremoval.html", @@ -317,8 +317,15 @@ { "objectID": "notebooks/2024-05-01_bengtsson-palme.html", "href": "notebooks/2024-05-01_bengtsson-palme.html", - "title": "Workflow analysis of Bengtsson-Palme et al. (2021)", + "title": "Workflow analysis of Bengtsson-Palme et al. 
(2016)", "section": "", - "text": "In this entry, I’m analyzing Bengtsson-Palme et al. (2016), a study of grab samples taken from three treatment plants in Sweden in September 2012. This was a DNA-sequencing study focused on investigating AMR in sewage communities. Influent, effluent and sludge samples were taken from each location, with a total of 70 samples across all sites: 20 influent, 10 effluent and 40 sludge. Liquid samples were filtered through a 1mm seive, then centrifuged, retaining the pellet (note that we expect this to select against viruses), which was then resuspended and underwent DNA extraction. Sludge samples weren’t concentrated, but went directly to DNA extraction. Samples were sequenced on an Illumina HiSeq 2500, producing 2x101bp reads.\nThe raw data\nThe sample composition of the Bengtsson-Palme dataset was as follows:\n\nCode# Importing the data is a bit more complicated this time as the samples are split across three pipeline runs\ndata_dir <- \"../data/2024-04-30_bengtsson-palme\"\n\n# Data input paths\nlibraries_path <- file.path(data_dir, \"sample-metadata.csv\")\nbasic_stats_path <- file.path(data_dir, \"qc_basic_stats.tsv.gz\")\nadapter_stats_path <- file.path(data_dir, \"qc_adapter_stats.tsv.gz\")\nquality_base_stats_path <- file.path(data_dir, \"qc_quality_base_stats.tsv.gz\")\nquality_seq_stats_path <- file.path(data_dir, \"qc_quality_sequence_stats.tsv.gz\")\n\n# Import libraries and extract metadata from sample names\nlibraries_raw <- lapply(libraries_path, read_csv, show_col_types = FALSE) %>% bind_rows\nlibraries <- libraries_raw %>%\n # Process sample types\n mutate(sample_group = ifelse(grepl(\"Inlet\", sample_type), \"Influent\",\n ifelse(grepl(\"Primary\", sample_type), \"Sludge (Primary/Surplus)\",\n ifelse(grepl(\"Surplus\", sample_type), \"Sludge (Primary/Surplus)\",\n ifelse(grepl(\"Digested\", sample_type), \"Sludge (Other)\",\n ifelse(grepl(\"Kemikond\", sample_type), \"Sludge (Other)\",\n \"Effluent\"))))),\n 
sample_group = factor(sample_group, levels = c(\"Influent\", \"Effluent\", \"Sludge (Primary/Surplus)\", \n \"Sludge (Other)\")),\n sludge = grepl(\"Sludge\", sample_group)) %>%\n arrange(location, sample_group, sample) %>%\n mutate(location = fct_inorder(location),\n sample = fct_inorder(sample))\n\n# Make table\ncount_samples <- libraries %>% group_by(sample_group, location) %>% count %>%\n pivot_wider(names_from = \"location\", values_from=\"n\") %>%\n rename(`Sample Type`=sample_group)\ncount_samples\n\n\n \n\n\n\n\nCode# Import QC data\nstages <- c(\"raw_concat\", \"cleaned\", \"dedup\", \"ribo_initial\", \"ribo_secondary\")\nimport_basic <- function(paths){\n lapply(paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>%\n inner_join(libraries, by=\"sample\") %>%\n arrange(location, sample_group, sample) %>%\n mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\n}\nimport_basic_paired <- function(paths){\n import_basic(paths) %>% arrange(read_pair) %>% \n mutate(read_pair = fct_inorder(as.character(read_pair)))\n}\nbasic_stats <- import_basic(basic_stats_path)\nadapter_stats <- import_basic_paired(adapter_stats_path)\nquality_base_stats <- import_basic_paired(quality_base_stats_path)\nquality_seq_stats <- import_basic_paired(quality_seq_stats_path)\n\n# Filter to raw data\nbasic_stats_raw <- basic_stats %>% filter(stage == \"raw_concat\")\nadapter_stats_raw <- adapter_stats %>% filter(stage == \"raw_concat\")\nquality_base_stats_raw <- quality_base_stats %>% filter(stage == \"raw_concat\")\nquality_seq_stats_raw <- quality_seq_stats %>% filter(stage == \"raw_concat\")\n\n# Get key values for readout\nraw_read_counts <- basic_stats_raw %>% ungroup %>% \n summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs),\n rmean=mean(n_read_pairs), \n rtot = sum(n_read_pairs),\n btot = sum(n_bases_approx),\n dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\n\nThese 70 
samples yielded 23.7M-61.3M (mean 38.7M) reads per sample, for a total of 2.7B read pairs (539 gigabases of sequence). Read qualities were mostly high but tailed off towards the 3’ end, requiring some trimming. Adapter levels were high. Inferred duplication levels were low in sludge samples (1-12%, mean 4%) but much higher in liquid samples (22-90%, mean 46%), implying lower available sequence diversity in the latter sample groups.\n\nCode# Prepare data\nbasic_stats_raw_metrics <- basic_stats_raw %>%\n select(sample, sample_group, location,\n `# Read pairs` = n_read_pairs,\n `Total base pairs\\n(approx)` = n_bases_approx,\n `% Duplicates\\n(FASTQC)` = percent_duplicates) %>%\n pivot_longer(-(sample:location), names_to = \"metric\", values_to = \"value\") %>%\n mutate(metric = fct_inorder(metric))\n\n# Set up plot templates\nscale_fill_st <- purrr::partial(scale_fill_brewer, palette=\"Set1\", name=\"Sample Type\")\ng_basic <- ggplot(basic_stats_raw_metrics, \n aes(x=sample, y=value, fill=sample_group, group=interaction(sample_group,sample))) +\n geom_col(position = \"dodge\") +\n scale_y_continuous(expand=c(0,0)) +\n expand_limits(y=c(0,100)) +\n scale_fill_st() + \n facet_grid(metric~location, scales = \"free\", space=\"free_x\", switch=\"y\") +\n theme_kit + theme(\n axis.title.y = element_blank(),\n strip.text.y = element_text(face=\"plain\")\n )\ng_basic\n\n\n\n\n\n\n\n\nCode# Set up plotting templates\nscale_color_st <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Sample Type\")\ng_qual_raw <- ggplot(mapping=aes(color=sample_group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters_raw <- g_qual_raw + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +\n scale_y_continuous(name=\"% Adapters\", 
limits=c(0,NA),\n breaks = seq(0,100,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(.~adapter)\ng_adapters_raw\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base_raw <- g_qual_raw +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0))\ng_quality_base_raw\n\n\n\n\n\n\nCodeg_quality_seq_raw <- g_qual_raw +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0))\ng_quality_seq_raw\n\n\n\n\n\n\n\nPreprocessing\nThe average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. As expected given the observed difference in duplication levels, many more reads were lost during deduplication in liquid samples than sludge samples. Conversely, trimming and filtering consistently removed more reads in sludge than in liquid samples, though the effect was less dramatic than for deduplication. 
Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries.\n\nCoden_reads_rel <- basic_stats %>% \n select(sample, location, sample_group, sludge, stage, \n percent_duplicates, n_read_pairs) %>%\n group_by(sample) %>% arrange(sample, stage) %>%\n mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0),\n p_reads_lost = 1 - p_reads_retained,\n p_reads_retained_abs = n_read_pairs / n_read_pairs[1],\n p_reads_lost_abs = 1-p_reads_retained_abs,\n p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0))\nn_reads_rel_display <- n_reads_rel %>% \n group_by(`Sludge?`=sludge, Stage=stage) %>% \n summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), \"-\", round(max(p_reads_lost_abs*100),1), \" (mean \", round(mean(p_reads_lost_abs*100),1), \")\"),\n `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), \"-\", round(max(p_reads_lost_abs_marginal*100),1), \" (mean \", round(mean(p_reads_lost_abs_marginal*100),1), \")\"), .groups=\"drop\") %>% \n filter(Stage != \"raw_concat\") %>%\n mutate(Stage = Stage %>% as.numeric %>% factor(labels=c(\"Trimming & filtering\", \"Deduplication\", \"Initial ribodepletion\", \"Secondary ribodepletion\")))\nn_reads_rel_display\n\n\n \n\n\n\n\nCodeg_stage_trace <- ggplot(basic_stats, aes(x=stage, color=sample_group, group=sample)) +\n scale_color_st() +\n facet_wrap(~location, scales=\"free\", ncol=3) +\n theme_kit\n\n# Plot reads over preprocessing\ng_reads_stages <- g_stage_trace +\n geom_line(aes(y=n_read_pairs)) +\n scale_y_continuous(\"# Read pairs\", expand=c(0,0), limits=c(0,NA))\ng_reads_stages\n\n\n\n\n\n\nCode# Plot relative read losses during preprocessing\ng_reads_rel <- ggplot(n_reads_rel, aes(x=stage, color=sample_group, group=sample)) +\n geom_line(aes(y=p_reads_lost_abs_marginal)) +\n scale_y_continuous(\"% Total Reads Lost\", expand=c(0,0), \n labels = function(x) x*100) +\n 
scale_color_st() +\n facet_wrap(~location, scales=\"free\", ncol=3) +\n theme_kit\ng_reads_rel\n\n\n\n\n\n\n\nData cleaning was very successful at removing adapters and improving read qualities:\n\nCodeg_qual <- ggplot(mapping=aes(color=sample_group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters <- g_qual + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,20),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~adapter)\ng_adapters\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base <- g_qual +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_base\n\n\n\n\n\n\nCodeg_quality_seq <- g_qual +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_seq\n\n\n\n\n\n\n\nAccording to FASTQC, cleaning + deduplication was very effective at reducing measured duplicate levels, which fell from an average of 45% to 5% in liquid samples and from 6% to 3% in sludge samples:\n\nCodestage_dup <- basic_stats %>% 
group_by(sludge,stage) %>% \n summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\ng_dup_stages <- g_stage_trace +\n geom_line(aes(y=percent_duplicates)) +\n scale_y_continuous(\"% Duplicates\", limits=c(0,NA), expand=c(0,0))\ng_dup_stages\n\n\n\n\n\n\nCodeg_readlen_stages <- g_stage_trace + geom_line(aes(y=mean_seq_len)) +\n scale_y_continuous(\"Mean read length (nt)\", expand=c(0,0), limits=c(0,NA))\ng_readlen_stages\n\n\n\n\n\n\n\nHigh-level composition\nAs before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:\n\nCodeclassifications <- c(\"Filtered\", \"Duplicate\", \"Ribosomal\", \"Unassigned\",\n \"Bacterial\", \"Archaeal\", \"Viral\", \"Human\")\n\n# Import composition data\ncomp_path <- file.path(data_dir, \"taxonomic_composition.tsv.gz\")\ncomp <- read_tsv(comp_path, show_col_types = FALSE) %>%\n left_join(libraries, by=\"sample\") %>%\n mutate(classification = factor(classification, levels = classifications))\n \n\n# Summarize composition\nread_comp_summ <- comp %>% \n group_by(location, sample_group, classification) %>%\n summarize(n_reads = sum(n_reads), .groups = \"drop_last\") %>%\n mutate(n_reads = replace_na(n_reads,0),\n p_reads = n_reads/sum(n_reads),\n pc_reads = p_reads*100)\n\n\n\nCode# Prepare plotting templates\ng_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) +\n facet_wrap(location~sample_group, scales = \"free_x\", ncol=4,\n labeller = label_wrap_gen(multi_line=FALSE, width=20)) +\n theme_xblank\nscale_y_pc_reads <- purrr::partial(scale_y_continuous, name = \"% Reads\",\n expand = c(0,0), labels = function(y) y*100)\n\n# Plot overall composition\ng_comp <- g_comp_base + geom_col(data = comp, position = 
\"stack\", width=1) +\n scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Classification\")\ng_comp\n\n\n\n\n\n\nCode# Plot composition of minor components\ncomp_minor <- comp %>% \n filter(classification %in% c(\"Archaeal\", \"Viral\", \"Human\", \"Other\"))\npalette_minor <- brewer.pal(9, \"Set1\")[6:9]\ng_comp_minor <- g_comp_base + \n geom_col(data=comp_minor, position = \"stack\", width=1) +\n scale_y_pc_reads() +\n scale_fill_manual(values=palette_minor, name = \"Classification\")\ng_comp_minor\n\n\n\n\n\n\n\n\nCodep_reads_summ_group <- comp %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, sample_group) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification, sample_group) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\")\np_reads_summ_prep <- p_reads_summ_group %>%\n mutate(classification = fct_inorder(classification),\n pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n display = paste0(pc_min, \"-\", pc_max, \"% (mean \", pc_mean, \"%)\"))\np_reads_summ <- p_reads_summ_prep %>%\n select(sample_group, classification, read_fraction=display) %>%\n arrange(sample_group, classification)\np_reads_summ\n\n\n \n\n\n\nIn all sample types, the majority of reads were either filtered, duplicates, or unassigned. Among assigned reads, the vast majority were bacterial, which is unsurprising given the sample processing protocols used. 
Total viral fraction averaged 0.011% in influent and primary sludge, and considerably lower in effluent and treated sludge. The human fraction was also low, averaging 0.033% across all sample types. Interestingly, treated sludge showed far higher fractions of archaeal reads than other sample types, possibly due to the anaerobic conditions experienced during sludge treatment.\nAs is common for DNA data, viral reads were overwhelmingly dominated by Caudoviricetes phages:\n\nCode# Get Kraken reports\nreports_path <- file.path(data_dir, \"kraken_reports.tsv.gz\")\nreports <- read_tsv(reports_path, show_col_types = FALSE)\n\n# Get viral taxonomy\nviral_taxa_path <- file.path(data_dir, \"viral-taxids.tsv.gz\")\nviral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)\n\n# Filter to viral taxa\nkraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>%\n group_by(sample) %>%\n mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])\nkraken_reports_viral_cleaned <- kraken_reports_viral %>%\n inner_join(libraries, by=\"sample\") %>%\n select(-pc_reads_total, -n_reads_direct, -contains(\"minimizers\")) %>%\n select(name, taxid, p_reads_viral, n_reads_clade, everything())\n\nviral_classes <- kraken_reports_viral_cleaned %>% filter(rank == \"C\")\nviral_families <- kraken_reports_viral_cleaned %>% filter(rank == \"F\")\n\n\n\nCodemajor_threshold <- 0.02\n\n# Identify major viral classes\nviral_classes_major_tab <- viral_classes %>% \n group_by(name, taxid) %>%\n summarize(p_reads_viral_max = max(p_reads_viral), .groups=\"drop\") %>%\n filter(p_reads_viral_max >= major_threshold)\nviral_classes_major_list <- viral_classes_major_tab %>% pull(name)\nviral_classes_major <- viral_classes %>% \n filter(name %in% viral_classes_major_list) %>%\n select(name, taxid, sample, sample_group, location, p_reads_viral)\nviral_classes_minor <- viral_classes_major %>% \n group_by(sample, sample_group, location) %>%\n summarize(p_reads_viral_major = sum(p_reads_viral), 
.groups = \"drop\") %>%\n mutate(name = \"Other\", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%\n select(name, taxid, sample, sample_group, location, p_reads_viral)\nviral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>%\n arrange(desc(p_reads_viral)) %>% \n mutate(name = factor(name, levels=c(viral_classes_major_list, \"Other\")),\n p_reads_viral = pmax(p_reads_viral, 0)) %>%\n rename(p_reads = p_reads_viral, classification=name)\n\npalette_viral <- c(brewer.pal(12, \"Set3\"), brewer.pal(8, \"Dark2\"))\ng_classes <- g_comp_base + \n geom_col(data=viral_classes_display, position = \"stack\", width=1) +\n scale_y_continuous(name=\"% Viral Reads\", limits=c(0,1.01), breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral class\")\n \ng_classes\n\n\n\n\n\n\n\nHuman-infecting virus reads: validation\nNext, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 565 reads were identified as putatively human-viral, with many samples showing fewer than 5 total HV read pairs. 
Even this total likely overstates total human-viral presence, however, as more than 300 of these reads had only low alignment scores to their putative viral sources:\n\nCode# Import HV read data\nhv_reads_filtered_path <- file.path(data_dir, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered <- lapply(hv_reads_filtered_path, read_tsv,\n show_col_types = FALSE) %>%\n bind_rows() %>%\n left_join(libraries, by=\"sample\")\n\n# Count reads\nn_hv_filtered <- hv_reads_filtered %>%\n group_by(sample, location, sample_group, seq_id) %>% count %>%\n group_by(sample, location, sample_group) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% \n select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%\n summarize(n_putative = sum(n_putative), n_total = sum(n_total), \n .groups=\"drop\") %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n\n\nCode# Collapse multi-entry sequences\nrmax <- purrr::partial(max, na.rm = TRUE)\ncollapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse=\"/\"))\nmrg <- hv_reads_filtered %>% \n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%\n arrange(desc(adj_score_max)) %>%\n group_by(seq_id) %>%\n summarize(sample = collapse(sample),\n genome_id = collapse(genome_id),\n taxid_best = taxid[1],\n taxid = collapse(as.character(taxid)),\n best_alignment_score_fwd = rmax(best_alignment_score_fwd),\n best_alignment_score_rev = rmax(best_alignment_score_rev),\n query_len_fwd = rmax(query_len_fwd),\n query_len_rev = rmax(query_len_rev),\n query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],\n query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],\n classified = rmax(classified),\n assigned_name = collapse(assigned_name),\n assigned_taxid_best = assigned_taxid[1],\n assigned_taxid = 
collapse(as.character(assigned_taxid)),\n assigned_hv = rmax(assigned_hv),\n hit_hv = rmax(hit_hv),\n encoded_hits = collapse(encoded_hits),\n adj_score_fwd = rmax(adj_score_fwd),\n adj_score_rev = rmax(adj_score_rev)\n ) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),\n highscore = adj_score_max >= 20)\n\n# Plot results\ngeom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)\ng_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +\n geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\") +\n facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = \"free_y\") +\n scale_x_continuous(name = \"Maximum adjusted alignment score\") + \n scale_y_continuous(name=\"# Read pairs\") + \n theme_base \ng_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)\ng_vhist_0\n\n\n\n\n\n\n\nBLASTing these reads against nt:\n\nCode# Import paired BLAST results\nblast_paired_path <- file.path(data_dir, \"hv_hits_blast_paired.tsv.gz\")\nblast_paired <- read_tsv(blast_paired_path, show_col_types = FALSE)\n\n# Add viral status\nblast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmatch_taxid <- function(taxid_1, taxid_2){\n p1 <- mapply(grepl, paste0(\"/\", taxid_1, \"$\"), taxid_2)\n p2 <- mapply(grepl, paste0(\"^\", taxid_1, \"/\"), taxid_2)\n p3 <- mapply(grepl, paste0(\"^\", taxid_1, \"$\"), taxid_2)\n out <- setNames(p1|p2|p3, NULL)\n return(out)\n}\nmrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)\nblast_assign <- inner_join(blast_viral, mrg_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = match_taxid(staxid, taxid),\n taxid_match_kraken = match_taxid(staxid, assigned_taxid),\n 
taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_out <- blast_assign %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n\n\nCode# Merge BLAST results with unenriched read data\nmrg_blast <- full_join(mrg, blast_out, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Viral status\")\ng_vhist_1\n\n\n\n\n\n\n\nIn order to achieve decent performance metrics under these conditions, I needed to exclude low-scoring reads with Kraken hits as well as those without. Doing this at my normal disjunctive score threshold of 20 gave passable precision (93%) but poor sensitivity (79%), leading to a poor overall F1 score (85%):\n\nCodetest_sens_spec <- function(tab, score_threshold){\n tab_retained <- tab %>% \n mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),\n retain = assigned_hv | retain_score) %>%\n group_by(viral_status_out, retain) %>% count\n pos_tru <- tab_retained %>% filter(viral_status_out == \"TRUE\", retain) %>% pull(n) %>% sum\n pos_fls <- tab_retained %>% filter(viral_status_out != \"TRUE\", retain) %>% pull(n) %>% sum\n neg_tru <- tab_retained %>% filter(viral_status_out != \"TRUE\", !retain) %>% pull(n) %>% sum\n neg_fls <- tab_retained %>% filter(viral_status_out == \"TRUE\", !retain) %>% pull(n) %>% sum\n sensitivity <- pos_tru / (pos_tru + neg_fls)\n specificity <- neg_tru / (neg_tru + pos_fls)\n precision <- pos_tru / (pos_tru + pos_fls)\n f1 <- 2 * precision * sensitivity / (precision + sensitivity)\n out <- tibble(threshold=score_threshold, sensitivity=sensitivity, \n specificity=specificity, precision=precision, f1=f1)\n return(out)\n}\nrange_f1 <- 
function(intab, inrange=15:45){\n tss <- purrr::partial(test_sens_spec, tab=intab)\n stats <- lapply(inrange, tss) %>% bind_rows %>%\n pivot_longer(!threshold, names_to=\"metric\", values_to=\"value\")\n return(stats)\n}\nstats_0 <- range_f1(mrg_blast)\ng_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +\n geom_vline(xintercept=20, color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Adjusted Score Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\") +\n theme_base\ng_stats_0\n\n\n\n\n\n\nCodestats_0 %>% filter(threshold == 20) %>% \n select(Threshold=threshold, Metric=metric, Value=value)\n\n\n \n\n\n\nLooking into the composition of different read groups, the bulk of high-scoring false positives map to human alphaherpesvirus 1 strain RH2 according to Bowtie2. BLASTN maps these sequences to a variety of bacterial taxa, especially E. 
coli and various Klebsiella species.\n\nCodemajor_threshold <- 0.05\n\n# Add missing viral taxa\nviral_taxa$name[viral_taxa$taxid == 211787] <- \"Human papillomavirus type 92\"\nviral_taxa$name[viral_taxa$taxid == 509154] <- \"Porcine endogenous retrovirus C\"\nviral_taxa$name[viral_taxa$taxid == 493803] <- \"Merkel cell polyomavirus\"\nviral_taxa$name[viral_taxa$taxid == 427343] <- \"Human papillomavirus 107\"\nviral_taxa$name[viral_taxa$taxid == 194958] <- \"Porcine endogenous retrovirus A\"\nviral_taxa$name[viral_taxa$taxid == 340907] <- \"Papiine alphaherpesvirus 2\"\nviral_taxa$name[viral_taxa$taxid == 194959] <- \"Porcine endogenous retrovirus B\"\n\n\n# Prepare data\nfp <- mrg_blast %>% \n group_by(viral_status_out, highscore, taxid_best) %>% count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n rename(taxid = taxid_best) %>%\n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p))\nfp_major_tab <- fp %>% filter(p > major_threshold) %>% arrange(desc(p))\nfp_major_list <- fp_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp_major <- fp %>% mutate(major = p > major_threshold) %>% \n mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% \n summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\n\n# Plot\ng_fp <- ggplot(fp_major, aes(x=score_display, y=p, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), \n breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_manual(values = palette_viral, name = \"Viral\\ntaxon\") +\n facet_grid(.~status_display) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_fp\n\n\n\n\n\n\n\n\nCode# 
Configure\nref_taxid_rh2 <- 946522\np_threshold <- 0.3\n\n# Get taxon names\ntax_names_path <- file.path(data_dir, \"taxid-names.tsv.gz\")\ntax_names <- read_tsv(tax_names_path, show_col_types = FALSE)\n\n# Add missing names\ntax_names_new <- tribble(~staxid, ~name,\n 3050295, \"Cytomegalovirus humanbeta5\",\n 459231, \"FLAG-tagging vector pFLAG97-TSR\",\n 3082113, \"Rangifer tarandus platyrhynchus\",\n 3119969, \"Bubalus kerabau\",\n 177155, \"Streptopelia turtur\",\n 187126, \"Nesoenas mayeri\",\n 244366, \"Klebsiella variicola\",\n )\ntax_names <- tax_names_new %>% filter(! staxid %in% tax_names$staxid) %>%\n bind_rows(tax_names) %>% arrange(staxid)\nref_name_rh2 <- tax_names %>% filter(staxid == ref_taxid_rh2) %>% pull(name)\n\n# Get major matches\nmrg_staxid <- mrg_blast %>% filter(taxid_best == ref_taxid_rh2) %>%\n group_by(highscore, viral_status_out) %>% mutate(n_seq = n())\nfp_staxid <- mrg_staxid %>%\n left_join(blast_paired, by=\"seq_id\") %>%\n mutate(staxid = as.integer(staxid)) %>%\n left_join(tax_names, by=\"staxid\") %>% rename(sname=name) %>%\n left_join(tax_names %>% rename(taxid_best=staxid), by=\"taxid_best\")\nfp_staxid_count <- fp_staxid %>%\n group_by(viral_status_out, highscore, \n taxid_best, name, staxid, sname, n_seq) %>%\n count %>%\n group_by(viral_status_out, highscore, taxid_best, name) %>%\n mutate(p=n/n_seq)\nfp_staxid_count_major <- fp_staxid_count %>%\n filter(n>1, p>p_threshold, !is.na(staxid)) %>%\n mutate(score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \n \"True positive\", \"False positive\"))\n\n# Plot\ng <- ggplot(fp_staxid_count_major, aes(x=p, y=sname)) + \n geom_col() + \n facet_grid(status_display~score_display, scales=\"free\",\n labeller = label_wrap_gen(multi_line = FALSE)) +\n scale_x_continuous(name=\"% mapped reads\", limits=c(0,1), breaks=seq(0,1,0.2),\n expand=c(0,0)) +\n labs(title=paste0(ref_name_rh2, \" (taxid \", ref_taxid_rh2, \")\")) +\n theme_base + 
theme(\n axis.title.y = element_blank(),\n plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\ng\n\n\n\n\n\n\n\nThis is the second DNA wastewater dataset I’ve run (along with Brinch) where alphaherpesvirus 1 strain RH2 represents a large fraction of high-scoring false-positives. In both datasets, all of these reads are mapped by Bowtie to a single reference genome, with ID AB618031.1. This is just one of over 70 HSV-1 genomes present in our reference database. As such, I decided to try removing this genome from the database and re-running the analysis to see if this reduced the number of high-scoring false positives.\nRepeating the analysis with this modification reduces the number of putative HV reads by 33, increases precision from 93% to 98%, and eliminates high-scoring false-positives mapping to human alphaherpesvirus 1:\n\nCodedata_dir_2 <- file.path(data_dir, \"take2\")\n\n# Import HV read data\nhv_reads_filtered_2_path <- file.path(data_dir_2, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered_2 <- lapply(hv_reads_filtered_2_path, read_tsv,\n show_col_types = FALSE) %>%\n bind_rows() %>%\n left_join(libraries, by=\"sample\")\n\n# Count reads\nn_hv_filtered_2 <- hv_reads_filtered_2 %>%\n group_by(sample, location, sample_group, seq_id) %>% count %>%\n group_by(sample, location, sample_group) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% \n select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ_2 <- n_hv_filtered_2 %>% ungroup %>%\n summarize(n_putative = sum(n_putative), n_total = sum(n_total), \n .groups=\"drop\") %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n# Process read data\nmrg2 <- hv_reads_filtered_2 %>% \n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%\n arrange(desc(adj_score_max)) %>%\n group_by(seq_id) 
%>%\n summarize(sample = collapse(sample),\n genome_id = collapse(genome_id),\n taxid_best = taxid[1],\n taxid = collapse(as.character(taxid)),\n best_alignment_score_fwd = rmax(best_alignment_score_fwd),\n best_alignment_score_rev = rmax(best_alignment_score_rev),\n query_len_fwd = rmax(query_len_fwd),\n query_len_rev = rmax(query_len_rev),\n query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],\n query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],\n classified = rmax(classified),\n assigned_name = collapse(assigned_name),\n assigned_taxid_best = assigned_taxid[1],\n assigned_taxid = collapse(as.character(assigned_taxid)),\n assigned_hv = rmax(assigned_hv),\n hit_hv = rmax(hit_hv),\n encoded_hits = collapse(encoded_hits),\n adj_score_fwd = rmax(adj_score_fwd),\n adj_score_rev = rmax(adj_score_rev)\n ) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),\n highscore = adj_score_max >= 20)\n\n\n\nCode# Import paired BLAST results\nblast_paired_2_path <- file.path(data_dir_2, \"hv_hits_blast_paired.tsv.gz\")\nblast_paired_2 <- read_tsv(blast_paired_2_path, show_col_types = FALSE)\n\n# Add viral status\nblast_viral_2 <- mutate(blast_paired_2, viral = staxid %in% viral_taxa$taxid) %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg2_assign <- mrg2 %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)\nblast_assign_2 <- inner_join(blast_viral, mrg2_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = match_taxid(staxid, taxid),\n taxid_match_kraken = match_taxid(staxid, assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_out_2 <- blast_assign_2 %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n 
ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n# Merge BLAST results with unenriched read data\nmrg2_blast <- full_join(mrg2, blast_out_2, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_vhist_2 <- g_vhist_base + geom_vhist(data=mrg2_blast, mapping=aes(fill=viral_status_out)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Viral status\")\ng_vhist_2\n\n\n\n\n\n\n\n\nCodestats_1 <- range_f1(mrg2_blast)\ng_stats_1 <- ggplot(stats_1, aes(x=threshold, y=value, color=metric)) +\n geom_vline(xintercept=20, color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Adjusted Score Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\") +\n theme_base\ng_stats_1\n\n\n\n\n\n\nCodestats_1 %>% filter(threshold == 20) %>% \n select(Threshold=threshold, Metric=metric, Value=value)\n\n\n \n\n\n\n\nCode# Prepare data\nfp2 <- mrg2_blast %>% \n group_by(viral_status_out, highscore, taxid_best) %>% count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n rename(taxid = taxid_best) %>%\n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p))\nfp2_major_tab <- fp2 %>% filter(p > major_threshold) %>% arrange(desc(p))\nfp2_major_list <- fp2_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp2_major <- fp2 %>% mutate(major = p > major_threshold) %>% \n mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% \n summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp2_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\n\n# Plot\ng_fp2 <- ggplot(fp2_major, aes(x=score_display, y=p, 
fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), \n breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_manual(values = palette_viral, name = \"Viral\\ntaxon\") +\n facet_grid(.~status_display) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_fp2\n\n\n\n\n\n\n\nHuman-infecting viruses: overall relative abundance\n\nCode# Get raw read counts\nread_counts_raw <- basic_stats_raw %>%\n select(sample, location, sample_group, n_reads_raw = n_read_pairs)\n\n# Get HV read counts\nmrg_hv <- mrg2 %>% mutate(hv_status = assigned_hv | hit_hv | highscore) %>%\n rename(taxid_all = taxid, taxid = taxid_best)\nread_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% \n count(name=\"n_reads_hv\")\nread_counts <- read_counts_raw %>% left_join(read_counts_hv, by=\"sample\") %>%\n mutate(n_reads_hv = replace_na(n_reads_hv, 0))\n\n# Aggregate\nread_counts_grp <- read_counts %>% group_by(location, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample= \"All samples\")\nread_counts_st <- read_counts_grp %>% group_by(sample, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(location = \"All locations\")\nread_counts_loc <- read_counts_grp %>%\n group_by(sample, location) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample_group = \"All sample types\")\nread_counts_tot <- read_counts_loc %>% group_by(sample, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(location = \"All locations\")\nread_counts_agg <- bind_rows(read_counts_grp, read_counts_st,\n read_counts_loc, read_counts_tot) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw,\n location = factor(location, 
levels = c(levels(libraries$location), \"All locations\")),\n sample_group = factor(sample_group, levels = c(levels(libraries$sample_group), \"All sample types\")))\n\n\nApplying a disjunctive cutoff at S=20 identifies 230 read pairs as human-viral. This gives an overall relative HV abundance of \\(8.50 \\times 10^{-8}\\). While very low across all sample types, HV RA was noticeably higher in influent and primary sludge than in sample types that had undergone more extensive processing (effluent and processed sludge):\n\nCode# Visualize\ng_phv_agg <- ggplot(read_counts_agg, aes(x=sample_group, color=location)) +\n geom_point(aes(y=p_reads_hv)) +\n scale_y_log10(\"Relative abundance of human virus reads\") +\n scale_color_brewer(name=\"Location\", palette=\"Dark2\") + theme_kit\ng_phv_agg\n\n\n\n\n\n\n\nThis is by far the lowest HV relative abundance I’ve seen across any of the datasets I’ve analyzed:\n\nCode# Collate past RA values\nra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Brumfield\", 5e-5, \"RNA\", FALSE,\n \"Brumfield\", 3.66e-7, \"DNA\", FALSE,\n \"Spurbeck\", 5.44e-6, \"RNA\", FALSE,\n \"Yang\", 3.62e-4, \"RNA\", FALSE,\n \"Rothman (unenriched)\", 1.87e-5, \"RNA\", FALSE,\n \"Rothman (panel-enriched)\", 3.3e-5, \"RNA\", TRUE,\n \"Crits-Christoph (unenriched)\", 1.37e-5, \"RNA\", FALSE,\n \"Crits-Christoph (panel-enriched)\", 1.26e-2, \"RNA\", TRUE,\n \"Prussin (non-control)\", 1.63e-5, \"RNA\", FALSE,\n \"Prussin (non-control)\", 4.16e-5, \"DNA\", FALSE,\n \"Rosario (non-control)\", 1.21e-5, \"RNA\", FALSE,\n \"Rosario (non-control)\", 1.50e-4, \"DNA\", FALSE,\n \"Leung\", 1.73e-5, \"DNA\", FALSE,\n \"Brinch\", 3.88e-6, \"DNA\", FALSE\n)\n\n# Collate new RA values\nra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Bengtsson-Palme\", 8.86e-8, \"DNA\", FALSE)\n\n\n# Plot\nscale_color_na <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Nucleic acid type\")\nra_comp <- bind_rows(ra_past, ra_new) %>% 
mutate(dataset = fct_inorder(dataset))\ng_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +\n geom_point() +\n scale_color_na() +\n scale_x_log10(name=\"Relative abundance of human virus reads\") +\n theme_base + theme(axis.title.y = element_blank())\ng_ra_comp\n\n\n\n\n\n\n\nHuman-infecting viruses: taxonomy and composition\nIn investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 13 samples, 3 from influent and 10 from primary sludge, met this criterion.\nAt the family level, most samples across all locations were dominated by Poxviridae and Adenoviridae, with Herpesviridae, Papillomaviridae and Picornaviridae also making a significant appearance in at least some samples:\n\nCode# Get viral taxon names for putative HV reads\nviral_taxa$name[viral_taxa$taxid == 249588] <- \"Mamastrovirus\"\nviral_taxa$name[viral_taxa$taxid == 194960] <- \"Kobuvirus\"\nviral_taxa$name[viral_taxa$taxid == 688449] <- \"Salivirus\"\nviral_taxa$name[viral_taxa$taxid == 585893] <- \"Picobirnaviridae\"\nviral_taxa$name[viral_taxa$taxid == 333922] <- \"Betapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 334207] <- \"Betapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 369960] <- \"Porcine type-C oncovirus\"\nviral_taxa$name[viral_taxa$taxid == 333924] <- \"Betapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 687329] <- \"Anelloviridae\"\nviral_taxa$name[viral_taxa$taxid == 325455] <- \"Gammapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 333750] <- \"Alphapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 694002] <- \"Betacoronavirus\"\nviral_taxa$name[viral_taxa$taxid == 334202] <- \"Mupapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 197911] <- \"Alphainfluenzavirus\"\nviral_taxa$name[viral_taxa$taxid == 186938] <- \"Respirovirus\"\nviral_taxa$name[viral_taxa$taxid == 
333926] <- \"Gammapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337051] <- \"Betapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337043] <- \"Alphapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 694003] <- \"Betacoronavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 334204] <- \"Mupapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 334208] <- \"Betapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 333928] <- \"Gammapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 337039] <- \"Alphapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 333929] <- \"Gammapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 337042] <- \"Alphapapillomavirus 7\"\nviral_taxa$name[viral_taxa$taxid == 334203] <- \"Mupapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 333757] <- \"Alphapapillomavirus 8\"\nviral_taxa$name[viral_taxa$taxid == 337050] <- \"Alphapapillomavirus 6\"\nviral_taxa$name[viral_taxa$taxid == 333767] <- \"Alphapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 333754] <- \"Alphapapillomavirus 10\"\nviral_taxa$name[viral_taxa$taxid == 687363] <- \"Torque teno virus 24\"\nviral_taxa$name[viral_taxa$taxid == 687342] <- \"Torque teno virus 3\"\nviral_taxa$name[viral_taxa$taxid == 687359] <- \"Torque teno virus 20\"\nviral_taxa$name[viral_taxa$taxid == 194441] <- \"Primate T-lymphotropic virus 2\"\nviral_taxa$name[viral_taxa$taxid == 334209] <- \"Betapapillomavirus 5\"\nviral_taxa$name[viral_taxa$taxid == 194965] <- \"Aichivirus B\"\nviral_taxa$name[viral_taxa$taxid == 333930] <- \"Gammapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 337048] <- \"Alphapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337041] <- \"Alphapapillomavirus 9\"\nviral_taxa$name[viral_taxa$taxid == 337049] <- \"Alphapapillomavirus 11\"\nviral_taxa$name[viral_taxa$taxid == 337044] <- \"Alphapapillomavirus 5\"\n\n# Filter samples and add viral taxa information\nsamples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% 
pull(sample)\nmrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep) %>% left_join(viral_taxa, by=\"taxid\") \n\n# Discover viral species & genera for HV reads\nraise_rank <- function(read_db, taxid_db, out_rank = \"species\", verbose = FALSE){\n # Get higher ranks than search rank\n ranks <- c(\"subspecies\", \"species\", \"subgenus\", \"genus\", \"subfamily\", \"family\", \"suborder\", \"order\", \"class\", \"subphylum\", \"phylum\", \"kingdom\", \"superkingdom\")\n rank_match <- which.max(ranks == out_rank)\n high_ranks <- ranks[rank_match:length(ranks)]\n # Merge read DB and taxid DB\n reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n # Extract sequences that are already at appropriate rank\n reads_rank <- filter(reads, rank == out_rank)\n # Drop sequences at a higher rank and return unclassified sequences\n reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...\n # Promote read taxids and re-merge with taxid DB, then re-classify and filter\n reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%\n bind_rows(reads_rank)\n reads_norank <- reads_remaining %>%\n filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n }\n # Finally, extract and append reads that were excluded during the process\n reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)\n reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n return(reads_out)\n}\nhv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, \"species\")\nhv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, \"genus\")\nhv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, 
\"family\")\n\n\n\nCodethreshold_major_family <- 0.08\n\n# Count reads for each human-viral family\nhv_family_counts <- hv_reads_family %>% \n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nhv_family_major_tab <- hv_family_counts %>% group_by(name) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > threshold_major_family)\nhv_family_counts_major <- hv_family_counts %>%\n mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_family_major_tab$name, \"Other\")))\nhv_family_counts_display <- hv_family_counts_major %>%\n rename(p_reads = p_reads_hv, classification = name_display)\n\n# Plot\ng_hv_family <- g_comp_base + \n geom_col(data=hv_family_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral family\") +\n labs(title=\"Family composition of human-viral reads\") +\n guides(fill=guide_legend(ncol=4)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\ng_hv_family\n\n\n\n\n\n\nCode# Get most prominent families for text\nhv_family_collate <- hv_family_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_hv), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nIn investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that 
family made up at least 10% of human-viral reads:\n\nCodethreshold_major_species <- 0.1\ntaxid_pox <- 10240\n\n# Get set of poxviridae reads\npox_samples <- hv_family_counts %>% filter(taxid == taxid_pox) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npox_ids <- hv_reads_family %>% \n filter(taxid == taxid_pox, sample %in% pox_samples) %>%\n pull(seq_id)\n\n# Count reads for each poxviridae species\npox_species_counts <- hv_reads_species %>%\n filter(seq_id %in% pox_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_pox = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npox_species_major_tab <- pox_species_counts %>% group_by(name) %>% \n filter(p_reads_pox == max(p_reads_pox)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_pox)) %>% \n filter(p_reads_pox > threshold_major_species)\npox_species_counts_major <- pox_species_counts %>%\n mutate(name_display = ifelse(name %in% pox_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_pox = sum(n_reads_hv),\n p_reads_pox = sum(p_reads_pox), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(pox_species_major_tab$name, \"Other\")))\npox_species_counts_display <- pox_species_counts_major %>%\n rename(p_reads = p_reads_pox, classification = name_display)\n\n# Plot\ng_pox_species <- g_comp_base + \n geom_col(data=pox_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Poxviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Poxviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, 
face=\"plain\"))\n\ng_pox_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npox_species_collate <- pox_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_pox), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_adeno <- 10508\n\n# Get set of adenoviridae reads\nadeno_samples <- hv_family_counts %>% filter(taxid == taxid_adeno) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\nadeno_ids <- hv_reads_family %>% \n filter(taxid == taxid_adeno, sample %in% adeno_samples) %>%\n pull(seq_id)\n\n# Count reads for each adenoviridae species\nadeno_species_counts <- hv_reads_species %>%\n filter(seq_id %in% adeno_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_adeno = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nadeno_species_major_tab <- adeno_species_counts %>% group_by(name) %>% \n filter(p_reads_adeno == max(p_reads_adeno)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_adeno)) %>% \n filter(p_reads_adeno > threshold_major_species)\nadeno_species_counts_major <- adeno_species_counts %>%\n mutate(name_display = ifelse(name %in% adeno_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_adeno = sum(n_reads_hv),\n p_reads_adeno = sum(p_reads_adeno), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(adeno_species_major_tab$name, \"Other\")))\nadeno_species_counts_display <- adeno_species_counts_major %>%\n rename(p_reads = p_reads_adeno, classification = name_display)\n\n# Plot\ng_adeno_species <- g_comp_base + \n geom_col(data=adeno_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Adenoviridae Reads\", limits=c(0,1.01), \n breaks = 
seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Adenoviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_adeno_species\n\n\n\n\n\n\nCode# Get most prominent species for text\nadeno_species_collate <- adeno_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_adeno), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_herpes <- 10292\n\n# Get set of herpesviridae reads\nherpes_samples <- hv_family_counts %>% filter(taxid == taxid_herpes) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\nherpes_ids <- hv_reads_family %>% \n filter(taxid == taxid_herpes, sample %in% herpes_samples) %>%\n pull(seq_id)\n\n# Count reads for each herpesviridae species\nherpes_species_counts <- hv_reads_species %>%\n filter(seq_id %in% herpes_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_herpes = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nherpes_species_major_tab <- herpes_species_counts %>% group_by(name) %>% \n filter(p_reads_herpes == max(p_reads_herpes)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_herpes)) %>% \n filter(p_reads_herpes > threshold_major_species)\nherpes_species_counts_major <- herpes_species_counts %>%\n mutate(name_display = ifelse(name %in% herpes_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_herpes = sum(n_reads_hv),\n p_reads_herpes = sum(p_reads_herpes), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(herpes_species_major_tab$name, 
\"Other\")))\nherpes_species_counts_display <- herpes_species_counts_major %>%\n rename(p_reads = p_reads_herpes, classification = name_display)\n\n# Plot\ng_herpes_species <- g_comp_base + \n geom_col(data=herpes_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% herpesviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of herpesviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_herpes_species\n\n\n\n\n\n\nCode# Get most prominent species for text\nherpes_species_collate <- herpes_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_herpes), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_papilloma <- 151340\n\n# Get set of papillomaviridae reads\npapilloma_samples <- hv_family_counts %>% filter(taxid == taxid_papilloma) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npapilloma_ids <- hv_reads_family %>% \n filter(taxid == taxid_papilloma, sample %in% papilloma_samples) %>%\n pull(seq_id)\n\n# Count reads for each papillomaviridae species\npapilloma_species_counts <- hv_reads_species %>%\n filter(seq_id %in% papilloma_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_papilloma = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npapilloma_species_major_tab <- papilloma_species_counts %>% group_by(name) %>% \n filter(p_reads_papilloma == max(p_reads_papilloma)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_papilloma)) %>% \n filter(p_reads_papilloma > threshold_major_species)\npapilloma_species_counts_major <- 
papilloma_species_counts %>%\n mutate(name_display = ifelse(name %in% papilloma_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_papilloma = sum(n_reads_hv),\n p_reads_papilloma = sum(p_reads_papilloma), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(papilloma_species_major_tab$name, \"Other\")))\npapilloma_species_counts_display <- papilloma_species_counts_major %>%\n rename(p_reads = p_reads_papilloma, classification = name_display)\n\n# Plot\ng_papilloma_species <- g_comp_base + \n geom_col(data=papilloma_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Papillomaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Papillomaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_papilloma_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npapilloma_species_collate <- papilloma_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_papilloma), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_picorna <- 12058\n\n# Get set of picornaviridae reads\npicorna_samples <- hv_family_counts %>% filter(taxid == taxid_picorna) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npicorna_ids <- hv_reads_family %>% \n filter(taxid == taxid_picorna, sample %in% picorna_samples) %>%\n pull(seq_id)\n\n# Count reads for each picornaviridae species\npicorna_species_counts <- hv_reads_species %>%\n filter(seq_id %in% picorna_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n 
mutate(p_reads_picorna = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npicorna_species_major_tab <- picorna_species_counts %>% group_by(name) %>% \n filter(p_reads_picorna == max(p_reads_picorna)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_picorna)) %>% \n filter(p_reads_picorna > threshold_major_species)\npicorna_species_counts_major <- picorna_species_counts %>%\n mutate(name_display = ifelse(name %in% picorna_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_picorna = sum(n_reads_hv),\n p_reads_picorna = sum(p_reads_picorna), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(picorna_species_major_tab$name, \"Other\")))\npicorna_species_counts_display <- picorna_species_counts_major %>%\n rename(p_reads = p_reads_picorna, classification = name_display)\n\n# Plot\ng_picorna_species <- g_comp_base + \n geom_col(data=picorna_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Picornaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Picornaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_picorna_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npicorna_species_collate <- picorna_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_picorna), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nFinally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:\n\nCode# Define reference genera\npath_genera_rna <- c(\"Mamastrovirus\", \"Enterovirus\", \"Salivirus\", \"Kobuvirus\", \"Norovirus\", 
\"Sapovirus\", \"Rotavirus\", \"Alphacoronavirus\", \"Betacoronavirus\", \"Alphainfluenzavirus\", \"Betainfluenzavirus\", \"Lentivirus\")\npath_genera_dna <- c(\"Mastadenovirus\", \"Alphapolyomavirus\", \"Betapolyomavirus\", \"Alphapapillomavirus\", \"Betapapillomavirus\", \"Gammapapillomavirus\", \"Orthopoxvirus\", \"Simplexvirus\",\n \"Lymphocryptovirus\", \"Cytomegalovirus\", \"Dependoparvovirus\")\npath_genera <- bind_rows(tibble(name=path_genera_rna, genome_type=\"RNA genome\"),\n tibble(name=path_genera_dna, genome_type=\"DNA genome\")) %>%\n left_join(viral_taxa, by=\"name\")\n\n# Count in each sample\nmrg_hv_named_all <- mrg_hv %>% left_join(viral_taxa, by=\"taxid\")\nhv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, \"genus\")\nn_path_genera <- hv_reads_genus_all %>% \n group_by(sample, location, sample_group, name, taxid) %>% \n count(name=\"n_reads_viral\") %>% \n inner_join(path_genera, by=c(\"name\", \"taxid\")) %>%\n left_join(read_counts_raw, by=c(\"sample\", \"location\", \"sample_group\")) %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n# Pivot out and back to add zero lines\nn_path_genera_out <- n_path_genera %>% ungroup %>% select(sample, name, n_reads_viral) %>%\n pivot_wider(names_from=\"name\", values_from=\"n_reads_viral\", values_fill=0) %>%\n pivot_longer(-sample, names_to=\"name\", values_to=\"n_reads_viral\") %>%\n left_join(read_counts_raw, by=\"sample\") %>%\n left_join(path_genera, by=\"name\") %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n## Aggregate across dates\nn_path_genera_stype <- n_path_genera_out %>% \n group_by(name, taxid, genome_type, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_viral = sum(n_reads_viral), .groups = \"drop\") %>%\n mutate(sample=\"All samples\", location=\"All locations\",\n p_reads_viral = n_reads_viral/n_reads_raw,\n na_type = \"DNA\")\n\n# Plot\ng_path_genera <- ggplot(n_path_genera_stype,\n aes(y=name, x=p_reads_viral, color=sample_group)) 
+\n geom_point() +\n scale_x_log10(name=\"Relative abundance\") +\n scale_color_st() +\n facet_grid(genome_type~., scales=\"free_y\") +\n theme_base + theme(axis.title.y = element_blank())\ng_path_genera\n\n\n\n\n\n\n\nConclusion\nThis dataset is a cautionary tale about the dangers of looking for viruses in samples that were processed with something else in mind. The sample preparation methods used here would be expected to select against viruses, and it shows, with by far the lowest overall human-viral RA of any dataset I’ve investigated so far. Even so, there were some interesting results, such as the higher level of human viruses in influent and primary sludge compared to more processed sample types." + "text": "In this entry, I’m analyzing Bengtsson-Palme et al. (2016), a study of grab samples taken from three treatment plants in Sweden in September 2012. This was a DNA-sequencing study focused on investigating AMR in sewage communities. Influent, effluent and sludge samples were taken from each location, with a total of 70 samples across all sites: 20 influent, 10 effluent and 40 sludge. Liquid samples were filtered through a 1mm sieve, then centrifuged, retaining the pellet (note that we expect this to select against viruses), which was then resuspended and underwent DNA extraction. Sludge samples weren’t concentrated, but went directly to DNA extraction. 
Samples were sequenced on an Illumina HiSeq 2500, producing 2x101bp reads.\nThe raw data\nThe sample composition of the Bengtsson-Palme dataset was as follows:\n\nCode# Importing the data is a bit more complicated this time as the samples are split across three pipeline runs\ndata_dir <- \"../data/2024-04-30_bengtsson-palme\"\n\n# Data input paths\nlibraries_path <- file.path(data_dir, \"sample-metadata.csv\")\nbasic_stats_path <- file.path(data_dir, \"qc_basic_stats.tsv.gz\")\nadapter_stats_path <- file.path(data_dir, \"qc_adapter_stats.tsv.gz\")\nquality_base_stats_path <- file.path(data_dir, \"qc_quality_base_stats.tsv.gz\")\nquality_seq_stats_path <- file.path(data_dir, \"qc_quality_sequence_stats.tsv.gz\")\n\n# Import libraries and extract metadata from sample names\nlibraries_raw <- lapply(libraries_path, read_csv, show_col_types = FALSE) %>% bind_rows\nlibraries <- libraries_raw %>%\n # Process sample types\n mutate(sample_group = ifelse(grepl(\"Inlet\", sample_type), \"Influent\",\n ifelse(grepl(\"Primary\", sample_type), \"Sludge (Primary/Surplus)\",\n ifelse(grepl(\"Surplus\", sample_type), \"Sludge (Primary/Surplus)\",\n ifelse(grepl(\"Digested\", sample_type), \"Sludge (Other)\",\n ifelse(grepl(\"Kemikond\", sample_type), \"Sludge (Other)\",\n \"Effluent\"))))),\n sample_group = factor(sample_group, levels = c(\"Influent\", \"Effluent\", \"Sludge (Primary/Surplus)\", \n \"Sludge (Other)\")),\n sludge = grepl(\"Sludge\", sample_group)) %>%\n arrange(location, sample_group, sample) %>%\n mutate(location = fct_inorder(location),\n sample = fct_inorder(sample))\n\n# Make table\ncount_samples <- libraries %>% group_by(sample_group, location) %>% count %>%\n pivot_wider(names_from = \"location\", values_from=\"n\") %>%\n rename(`Sample Type`=sample_group)\ncount_samples\n\n\n \n\n\n\n\nCode# Import QC data\nstages <- c(\"raw_concat\", \"cleaned\", \"dedup\", \"ribo_initial\", \"ribo_secondary\")\nimport_basic <- function(paths){\n lapply(paths, read_tsv, 
show_col_types = FALSE) %>% bind_rows %>%\n inner_join(libraries, by=\"sample\") %>%\n arrange(location, sample_group, sample) %>%\n mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\n}\nimport_basic_paired <- function(paths){\n import_basic(paths) %>% arrange(read_pair) %>% \n mutate(read_pair = fct_inorder(as.character(read_pair)))\n}\nbasic_stats <- import_basic(basic_stats_path)\nadapter_stats <- import_basic_paired(adapter_stats_path)\nquality_base_stats <- import_basic_paired(quality_base_stats_path)\nquality_seq_stats <- import_basic_paired(quality_seq_stats_path)\n\n# Filter to raw data\nbasic_stats_raw <- basic_stats %>% filter(stage == \"raw_concat\")\nadapter_stats_raw <- adapter_stats %>% filter(stage == \"raw_concat\")\nquality_base_stats_raw <- quality_base_stats %>% filter(stage == \"raw_concat\")\nquality_seq_stats_raw <- quality_seq_stats %>% filter(stage == \"raw_concat\")\n\n# Get key values for readout\nraw_read_counts <- basic_stats_raw %>% ungroup %>% \n summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs),\n rmean=mean(n_read_pairs), \n rtot = sum(n_read_pairs),\n btot = sum(n_bases_approx),\n dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\n\nThese 70 samples yielded 23.7M-61.3M (mean 38.7M) reads per sample, for a total of 2.7B read pairs (539 gigabases of sequence). Read qualities were mostly high but tailed off towards the 3’ end, requiring some trimming. Adapter levels were high. 
Inferred duplication levels were low in sludge samples (1-12%, mean 4%) but much higher in liquid samples (22-90%, mean 46%), implying lower available sequence diversity in the latter sample groups.\n\nCode# Prepare data\nbasic_stats_raw_metrics <- basic_stats_raw %>%\n select(sample, sample_group, location,\n `# Read pairs` = n_read_pairs,\n `Total base pairs\\n(approx)` = n_bases_approx,\n `% Duplicates\\n(FASTQC)` = percent_duplicates) %>%\n pivot_longer(-(sample:location), names_to = \"metric\", values_to = \"value\") %>%\n mutate(metric = fct_inorder(metric))\n\n# Set up plot templates\nscale_fill_st <- purrr::partial(scale_fill_brewer, palette=\"Set1\", name=\"Sample Type\")\ng_basic <- ggplot(basic_stats_raw_metrics, \n aes(x=sample, y=value, fill=sample_group, group=interaction(sample_group,sample))) +\n geom_col(position = \"dodge\") +\n scale_y_continuous(expand=c(0,0)) +\n expand_limits(y=c(0,100)) +\n scale_fill_st() + \n facet_grid(metric~location, scales = \"free\", space=\"free_x\", switch=\"y\") +\n theme_xblank + theme(\n axis.title.y = element_blank(),\n strip.text.y = element_text(face=\"plain\")\n )\ng_basic\n\n\n\n\n\n\n\n\nCode# Set up plotting templates\nscale_color_st <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Sample Type\")\ng_qual_raw <- ggplot(mapping=aes(color=sample_group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters_raw <- g_qual_raw + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,NA),\n breaks = seq(0,100,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(.~adapter)\ng_adapters_raw\n\n\n\n\n\n\nCode# Visualize 
quality\ng_quality_base_raw <- g_qual_raw +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0))\ng_quality_base_raw\n\n\n\n\n\n\nCodeg_quality_seq_raw <- g_qual_raw +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0))\ng_quality_seq_raw\n\n\n\n\n\n\n\nPreprocessing\nThe average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. As expected given the observed difference in duplication levels, many more reads were lost during deduplication in liquid samples than sludge samples. Conversely, trimming and filtering consistently removed more reads in sludge than in liquid samples, though the effect was less dramatic than for deduplication. 
Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries.\n\nCoden_reads_rel <- basic_stats %>% \n select(sample, location, sample_group, sludge, stage, \n percent_duplicates, n_read_pairs) %>%\n group_by(sample) %>% arrange(sample, stage) %>%\n mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0),\n p_reads_lost = 1 - p_reads_retained,\n p_reads_retained_abs = n_read_pairs / n_read_pairs[1],\n p_reads_lost_abs = 1-p_reads_retained_abs,\n p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0))\nn_reads_rel_display <- n_reads_rel %>% \n group_by(`Sludge?`=sludge, Stage=stage) %>% \n summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), \"-\", round(max(p_reads_lost_abs*100),1), \" (mean \", round(mean(p_reads_lost_abs*100),1), \")\"),\n `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), \"-\", round(max(p_reads_lost_abs_marginal*100),1), \" (mean \", round(mean(p_reads_lost_abs_marginal*100),1), \")\"), .groups=\"drop\") %>% \n filter(Stage != \"raw_concat\") %>%\n mutate(Stage = Stage %>% as.numeric %>% factor(labels=c(\"Trimming & filtering\", \"Deduplication\", \"Initial ribodepletion\", \"Secondary ribodepletion\")))\nn_reads_rel_display\n\n\n \n\n\n\n\nCodeg_stage_trace <- ggplot(basic_stats, aes(x=stage, color=sample_group, group=sample)) +\n scale_color_st() +\n facet_wrap(~location, scales=\"free\", ncol=3) +\n theme_kit\n\n# Plot reads over preprocessing\ng_reads_stages <- g_stage_trace +\n geom_line(aes(y=n_read_pairs)) +\n scale_y_continuous(\"# Read pairs\", expand=c(0,0), limits=c(0,NA))\ng_reads_stages\n\n\n\n\n\n\nCode# Plot relative read losses during preprocessing\ng_reads_rel <- ggplot(n_reads_rel, aes(x=stage, color=sample_group, group=sample)) +\n geom_line(aes(y=p_reads_lost_abs_marginal)) +\n scale_y_continuous(\"% Total Reads Lost\", expand=c(0,0), \n labels = function(x) x*100) +\n 
scale_color_st() +\n facet_wrap(~location, scales=\"free\", ncol=3) +\n theme_kit\ng_reads_rel\n\n\n\n\n\n\n\nData cleaning was very successful at removing adapters and improving read qualities:\n\nCodeg_qual <- ggplot(mapping=aes(color=sample_group, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters <- g_qual + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,20),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~adapter)\ng_adapters\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base <- g_qual +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_base\n\n\n\n\n\n\nCodeg_quality_seq <- g_qual +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_seq\n\n\n\n\n\n\n\nAccording to FASTQC, cleaning + deduplication was very effective at reducing measured duplicate levels, which fell from an average of 45% to 5% in liquid samples and from 6% to 3% in sludge samples:\n\nCodestage_dup <- basic_stats %>% 
group_by(sludge,stage) %>% \n summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\ng_dup_stages <- g_stage_trace +\n geom_line(aes(y=percent_duplicates)) +\n scale_y_continuous(\"% Duplicates\", limits=c(0,NA), expand=c(0,0))\ng_dup_stages\n\n\n\n\n\n\nCodeg_readlen_stages <- g_stage_trace + geom_line(aes(y=mean_seq_len)) +\n scale_y_continuous(\"Mean read length (nt)\", expand=c(0,0), limits=c(0,NA))\ng_readlen_stages\n\n\n\n\n\n\n\nHigh-level composition\nAs before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:\n\nCodeclassifications <- c(\"Filtered\", \"Duplicate\", \"Ribosomal\", \"Unassigned\",\n \"Bacterial\", \"Archaeal\", \"Viral\", \"Human\")\n\n# Import composition data\ncomp_path <- file.path(data_dir, \"taxonomic_composition.tsv.gz\")\ncomp <- read_tsv(comp_path, show_col_types = FALSE) %>%\n left_join(libraries, by=\"sample\") %>%\n mutate(classification = factor(classification, levels = classifications))\n \n\n# Summarize composition\nread_comp_summ <- comp %>% \n group_by(location, sample_group, classification) %>%\n summarize(n_reads = sum(n_reads), .groups = \"drop_last\") %>%\n mutate(n_reads = replace_na(n_reads,0),\n p_reads = n_reads/sum(n_reads),\n pc_reads = p_reads*100)\n\n\n\nCode# Prepare plotting templates\ng_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) +\n facet_wrap(location~sample_group, scales = \"free_x\", ncol=4,\n labeller = label_wrap_gen(multi_line=FALSE, width=20)) +\n theme_xblank\nscale_y_pc_reads <- purrr::partial(scale_y_continuous, name = \"% Reads\",\n expand = c(0,0), labels = function(y) y*100)\n\n# Plot overall composition\ng_comp <- g_comp_base + geom_col(data = comp, position = 
\"stack\", width=1) +\n scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Classification\")\ng_comp\n\n\n\n\n\n\nCode# Plot composition of minor components\ncomp_minor <- comp %>% \n filter(classification %in% c(\"Archaeal\", \"Viral\", \"Human\", \"Other\"))\npalette_minor <- brewer.pal(9, \"Set1\")[6:9]\ng_comp_minor <- g_comp_base + \n geom_col(data=comp_minor, position = \"stack\", width=1) +\n scale_y_pc_reads() +\n scale_fill_manual(values=palette_minor, name = \"Classification\")\ng_comp_minor\n\n\n\n\n\n\n\n\nCodep_reads_summ_group <- comp %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, sample_group) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification, sample_group) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\")\np_reads_summ_prep <- p_reads_summ_group %>%\n mutate(classification = fct_inorder(classification),\n pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n display = paste0(pc_min, \"-\", pc_max, \"% (mean \", pc_mean, \"%)\"))\np_reads_summ <- p_reads_summ_prep %>%\n select(sample_group, classification, read_fraction=display) %>%\n arrange(sample_group, classification)\np_reads_summ\n\n\n \n\n\n\nIn all sample types, the majority of reads were either filtered, duplicates, or unassigned. Among assigned reads, the vast majority were bacterial, which is unsurprising given the sample processing protocols used. 
Total viral fraction averaged 0.011% in influent and primary sludge, and considerably lower in effluent and treated sludge. The human fraction was also low, averaging 0.033% across all sample types. Interestingly, treated sludge showed far higher fractions of archaeal reads than other sample types, possibly due to the anaerobic conditions experienced during sludge treatment.\nAs is common for DNA data, viral reads were overwhelmingly dominated by Caudoviricetes phages:\n\nCode# Get Kraken reports\nreports_path <- file.path(data_dir, \"kraken_reports.tsv.gz\")\nreports <- read_tsv(reports_path, show_col_types = FALSE)\n\n# Get viral taxonomy\nviral_taxa_path <- file.path(data_dir, \"viral-taxids.tsv.gz\")\nviral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)\n\n# Filter to viral taxa\nkraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>%\n group_by(sample) %>%\n mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])\nkraken_reports_viral_cleaned <- kraken_reports_viral %>%\n inner_join(libraries, by=\"sample\") %>%\n select(-pc_reads_total, -n_reads_direct, -contains(\"minimizers\")) %>%\n select(name, taxid, p_reads_viral, n_reads_clade, everything())\n\nviral_classes <- kraken_reports_viral_cleaned %>% filter(rank == \"C\")\nviral_families <- kraken_reports_viral_cleaned %>% filter(rank == \"F\")\n\n\n\nCodemajor_threshold <- 0.02\n\n# Identify major viral classes\nviral_classes_major_tab <- viral_classes %>% \n group_by(name, taxid) %>%\n summarize(p_reads_viral_max = max(p_reads_viral), .groups=\"drop\") %>%\n filter(p_reads_viral_max >= major_threshold)\nviral_classes_major_list <- viral_classes_major_tab %>% pull(name)\nviral_classes_major <- viral_classes %>% \n filter(name %in% viral_classes_major_list) %>%\n select(name, taxid, sample, sample_group, location, p_reads_viral)\nviral_classes_minor <- viral_classes_major %>% \n group_by(sample, sample_group, location) %>%\n summarize(p_reads_viral_major = sum(p_reads_viral), 
.groups = \"drop\") %>%\n mutate(name = \"Other\", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%\n select(name, taxid, sample, sample_group, location, p_reads_viral)\nviral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>%\n arrange(desc(p_reads_viral)) %>% \n mutate(name = factor(name, levels=c(viral_classes_major_list, \"Other\")),\n p_reads_viral = pmax(p_reads_viral, 0)) %>%\n rename(p_reads = p_reads_viral, classification=name)\n\npalette_viral <- c(brewer.pal(12, \"Set3\"), brewer.pal(8, \"Dark2\"))\ng_classes <- g_comp_base + \n geom_col(data=viral_classes_display, position = \"stack\", width=1) +\n scale_y_continuous(name=\"% Viral Reads\", limits=c(0,1.01), breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral class\")\n \ng_classes\n\n\n\n\n\n\n\nHuman-infecting virus reads: validation\nNext, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 565 reads were identified as putatively human-viral, with many samples showing fewer than 5 total HV read pairs. 
Even this total likely overstates total human-viral presence, however, as more than 300 of these reads had only low alignment scores to their putative viral sources:\n\nCode# Import HV read data\nhv_reads_filtered_path <- file.path(data_dir, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered <- lapply(hv_reads_filtered_path, read_tsv,\n show_col_types = FALSE) %>%\n bind_rows() %>%\n left_join(libraries, by=\"sample\")\n\n# Count reads\nn_hv_filtered <- hv_reads_filtered %>%\n group_by(sample, location, sample_group, seq_id) %>% count %>%\n group_by(sample, location, sample_group) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% \n select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%\n summarize(n_putative = sum(n_putative), n_total = sum(n_total), \n .groups=\"drop\") %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n\n\nCode# Collapse multi-entry sequences\nrmax <- purrr::partial(max, na.rm = TRUE)\ncollapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse=\"/\"))\nmrg <- hv_reads_filtered %>% \n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%\n arrange(desc(adj_score_max)) %>%\n group_by(seq_id) %>%\n summarize(sample = collapse(sample),\n genome_id = collapse(genome_id),\n taxid_best = taxid[1],\n taxid = collapse(as.character(taxid)),\n best_alignment_score_fwd = rmax(best_alignment_score_fwd),\n best_alignment_score_rev = rmax(best_alignment_score_rev),\n query_len_fwd = rmax(query_len_fwd),\n query_len_rev = rmax(query_len_rev),\n query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],\n query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],\n classified = rmax(classified),\n assigned_name = collapse(assigned_name),\n assigned_taxid_best = assigned_taxid[1],\n assigned_taxid = 
collapse(as.character(assigned_taxid)),\n assigned_hv = rmax(assigned_hv),\n hit_hv = rmax(hit_hv),\n encoded_hits = collapse(encoded_hits),\n adj_score_fwd = rmax(adj_score_fwd),\n adj_score_rev = rmax(adj_score_rev)\n ) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),\n highscore = adj_score_max >= 20)\n\n# Plot results\ngeom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)\ng_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +\n geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\") +\n facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = \"free_y\") +\n scale_x_continuous(name = \"Maximum adjusted alignment score\") + \n scale_y_continuous(name=\"# Read pairs\") + \n theme_base \ng_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)\ng_vhist_0\n\n\n\n\n\n\n\nBLASTing these reads against nt:\n\nCode# Import paired BLAST results\nblast_paired_path <- file.path(data_dir, \"hv_hits_blast_paired.tsv.gz\")\nblast_paired <- read_tsv(blast_paired_path, show_col_types = FALSE)\n\n# Add viral status\nblast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmatch_taxid <- function(taxid_1, taxid_2){\n p1 <- mapply(grepl, paste0(\"/\", taxid_1, \"$\"), taxid_2)\n p2 <- mapply(grepl, paste0(\"^\", taxid_1, \"/\"), taxid_2)\n p3 <- mapply(grepl, paste0(\"^\", taxid_1, \"$\"), taxid_2)\n out <- setNames(p1|p2|p3, NULL)\n return(out)\n}\nmrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)\nblast_assign <- inner_join(blast_viral, mrg_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = match_taxid(staxid, taxid),\n taxid_match_kraken = match_taxid(staxid, assigned_taxid),\n 
taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_out <- blast_assign %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n\n\nCode# Merge BLAST results with unenriched read data\nmrg_blast <- full_join(mrg, blast_out, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Viral status\")\ng_vhist_1\n\n\n\n\n\n\n\nIn order to achieve decent performance metrics under these conditions, I needed to exclude low-scoring reads with Kraken hits as well as those without. Doing this at my normal disjunctive score threshold of 20 gave passable precision (93%) but poor sensitivity (79%), leading to a poor overall F1 score (85%):\n\nCodetest_sens_spec <- function(tab, score_threshold){\n tab_retained <- tab %>% \n mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),\n retain = assigned_hv | retain_score) %>%\n group_by(viral_status_out, retain) %>% count\n pos_tru <- tab_retained %>% filter(viral_status_out == \"TRUE\", retain) %>% pull(n) %>% sum\n pos_fls <- tab_retained %>% filter(viral_status_out != \"TRUE\", retain) %>% pull(n) %>% sum\n neg_tru <- tab_retained %>% filter(viral_status_out != \"TRUE\", !retain) %>% pull(n) %>% sum\n neg_fls <- tab_retained %>% filter(viral_status_out == \"TRUE\", !retain) %>% pull(n) %>% sum\n sensitivity <- pos_tru / (pos_tru + neg_fls)\n specificity <- neg_tru / (neg_tru + pos_fls)\n precision <- pos_tru / (pos_tru + pos_fls)\n f1 <- 2 * precision * sensitivity / (precision + sensitivity)\n out <- tibble(threshold=score_threshold, sensitivity=sensitivity, \n specificity=specificity, precision=precision, f1=f1)\n return(out)\n}\nrange_f1 <- 
function(intab, inrange=15:45){\n tss <- purrr::partial(test_sens_spec, tab=intab)\n stats <- lapply(inrange, tss) %>% bind_rows %>%\n pivot_longer(!threshold, names_to=\"metric\", values_to=\"value\")\n return(stats)\n}\nstats_0 <- range_f1(mrg_blast)\ng_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +\n geom_vline(xintercept=20, color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Adjusted Score Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\") +\n theme_base\ng_stats_0\n\n\n\n\n\n\nCodestats_0 %>% filter(threshold == 20) %>% \n select(Threshold=threshold, Metric=metric, Value=value)\n\n\n \n\n\n\nLooking into the composition of different read groups, the bulk of high-scoring false positives map to human alphaherpesvirus 1 strain RH2 according to Bowtie2. BLASTN maps these sequences to a variety of bacterial taxa, especially E. 
coli and various Klebsiella species.\n\nCodemajor_threshold <- 0.05\n\n# Add missing viral taxa\nviral_taxa$name[viral_taxa$taxid == 211787] <- \"Human papillomavirus type 92\"\nviral_taxa$name[viral_taxa$taxid == 509154] <- \"Porcine endogenous retrovirus C\"\nviral_taxa$name[viral_taxa$taxid == 493803] <- \"Merkel cell polyomavirus\"\nviral_taxa$name[viral_taxa$taxid == 427343] <- \"Human papillomavirus 107\"\nviral_taxa$name[viral_taxa$taxid == 194958] <- \"Porcine endogenous retrovirus A\"\nviral_taxa$name[viral_taxa$taxid == 340907] <- \"Papiine alphaherpesvirus 2\"\nviral_taxa$name[viral_taxa$taxid == 194959] <- \"Porcine endogenous retrovirus B\"\n\n\n# Prepare data\nfp <- mrg_blast %>% \n group_by(viral_status_out, highscore, taxid_best) %>% count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n rename(taxid = taxid_best) %>%\n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p))\nfp_major_tab <- fp %>% filter(p > major_threshold) %>% arrange(desc(p))\nfp_major_list <- fp_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp_major <- fp %>% mutate(major = p > major_threshold) %>% \n mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% \n summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\n\n# Plot\ng_fp <- ggplot(fp_major, aes(x=score_display, y=p, fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), \n breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_manual(values = palette_viral, name = \"Viral\\ntaxon\") +\n facet_grid(.~status_display) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_fp\n\n\n\n\n\n\n\n\nCode# 
Configure\nref_taxid_rh2 <- 946522\np_threshold <- 0.3\n\n# Get taxon names\ntax_names_path <- file.path(data_dir, \"taxid-names.tsv.gz\")\ntax_names <- read_tsv(tax_names_path, show_col_types = FALSE)\n\n# Add missing names\ntax_names_new <- tribble(~staxid, ~name,\n 3050295, \"Cytomegalovirus humanbeta5\",\n 459231, \"FLAG-tagging vector pFLAG97-TSR\",\n 3082113, \"Rangifer tarandus platyrhynchus\",\n 3119969, \"Bubalus kerabau\",\n 177155, \"Streptopelia turtur\",\n 187126, \"Nesoenas mayeri\",\n 244366, \"Klebsiella variicola\",\n )\ntax_names <- tax_names_new %>% filter(! staxid %in% tax_names$staxid) %>%\n bind_rows(tax_names) %>% arrange(staxid)\nref_name_rh2 <- tax_names %>% filter(staxid == ref_taxid_rh2) %>% pull(name)\n\n# Get major matches\nmrg_staxid <- mrg_blast %>% filter(taxid_best == ref_taxid_rh2) %>%\n group_by(highscore, viral_status_out) %>% mutate(n_seq = n())\nfp_staxid <- mrg_staxid %>%\n left_join(blast_paired, by=\"seq_id\") %>%\n mutate(staxid = as.integer(staxid)) %>%\n left_join(tax_names, by=\"staxid\") %>% rename(sname=name) %>%\n left_join(tax_names %>% rename(taxid_best=staxid), by=\"taxid_best\")\nfp_staxid_count <- fp_staxid %>%\n group_by(viral_status_out, highscore, \n taxid_best, name, staxid, sname, n_seq) %>%\n count %>%\n group_by(viral_status_out, highscore, taxid_best, name) %>%\n mutate(p=n/n_seq)\nfp_staxid_count_major <- fp_staxid_count %>%\n filter(n>1, p>p_threshold, !is.na(staxid)) %>%\n mutate(score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \n \"True positive\", \"False positive\"))\n\n# Plot\ng <- ggplot(fp_staxid_count_major, aes(x=p, y=sname)) + \n geom_col() + \n facet_grid(status_display~score_display, scales=\"free\",\n labeller = label_wrap_gen(multi_line = FALSE)) +\n scale_x_continuous(name=\"% mapped reads\", limits=c(0,1), breaks=seq(0,1,0.2),\n expand=c(0,0)) +\n labs(title=paste0(ref_name_rh2, \" (taxid \", ref_taxid_rh2, \")\")) +\n theme_base + 
theme(\n axis.title.y = element_blank(),\n plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\ng\n\n\n\n\n\n\n\nThis is the second DNA wastewater dataset I’ve run (along with Brinch) where alphaherpesvirus 1 strain RH2 represents a large fraction of high-scoring false-positives. In both datasets, all of these reads are mapped by Bowtie to a single reference genome, with ID AB618031.1. This is just one of over 70 HSV-1 genomes present in our reference database. As such, I decided to try removing this genome from the database and re-running the analysis to see if this reduced the number of high-scoring false positives.\nRepeating the analysis with this modification reduces the number of putative HV reads by 33, increases precision from 93% to 98%, and eliminates high-scoring false-positives mapping to human alphaherpesvirus 1:\n\nCodedata_dir_2 <- file.path(data_dir, \"take2\")\n\n# Import HV read data\nhv_reads_filtered_2_path <- file.path(data_dir_2, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered_2 <- lapply(hv_reads_filtered_2_path, read_tsv,\n show_col_types = FALSE) %>%\n bind_rows() %>%\n left_join(libraries, by=\"sample\")\n\n# Count reads\nn_hv_filtered_2 <- hv_reads_filtered_2 %>%\n group_by(sample, location, sample_group, seq_id) %>% count %>%\n group_by(sample, location, sample_group) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% \n select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ_2 <- n_hv_filtered_2 %>% ungroup %>%\n summarize(n_putative = sum(n_putative), n_total = sum(n_total), \n .groups=\"drop\") %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)\n\n# Process read data\nmrg2 <- hv_reads_filtered_2 %>% \n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%\n arrange(desc(adj_score_max)) %>%\n group_by(seq_id) 
%>%\n summarize(sample = collapse(sample),\n genome_id = collapse(genome_id),\n taxid_best = taxid[1],\n taxid = collapse(as.character(taxid)),\n best_alignment_score_fwd = rmax(best_alignment_score_fwd),\n best_alignment_score_rev = rmax(best_alignment_score_rev),\n query_len_fwd = rmax(query_len_fwd),\n query_len_rev = rmax(query_len_rev),\n query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],\n query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],\n classified = rmax(classified),\n assigned_name = collapse(assigned_name),\n assigned_taxid_best = assigned_taxid[1],\n assigned_taxid = collapse(as.character(assigned_taxid)),\n assigned_hv = rmax(assigned_hv),\n hit_hv = rmax(hit_hv),\n encoded_hits = collapse(encoded_hits),\n adj_score_fwd = rmax(adj_score_fwd),\n adj_score_rev = rmax(adj_score_rev)\n ) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),\n highscore = adj_score_max >= 20)\n\n\n\nCode# Import paired BLAST results\nblast_paired_2_path <- file.path(data_dir_2, \"hv_hits_blast_paired.tsv.gz\")\nblast_paired_2 <- read_tsv(blast_paired_2_path, show_col_types = FALSE)\n\n# Add viral status\nblast_viral_2 <- mutate(blast_paired_2, viral = staxid %in% viral_taxa$taxid) %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmrg2_assign <- mrg2 %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)\nblast_assign_2 <- inner_join(blast_viral, mrg2_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = match_taxid(staxid, taxid),\n taxid_match_kraken = match_taxid(staxid, assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_out_2 <- blast_assign_2 %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n 
ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n# Merge BLAST results with unenriched read data\nmrg2_blast <- full_join(mrg2, blast_out_2, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_vhist_2 <- g_vhist_base + geom_vhist(data=mrg2_blast, mapping=aes(fill=viral_status_out)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Viral status\")\ng_vhist_2\n\n\n\n\n\n\n\n\nCodestats_1 <- range_f1(mrg2_blast)\ng_stats_1 <- ggplot(stats_1, aes(x=threshold, y=value, color=metric)) +\n geom_vline(xintercept=20, color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Adjusted Score Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\") +\n theme_base\ng_stats_1\n\n\n\n\n\n\nCodestats_1 %>% filter(threshold == 20) %>% \n select(Threshold=threshold, Metric=metric, Value=value)\n\n\n \n\n\n\n\nCode# Prepare data\nfp2 <- mrg2_blast %>% \n group_by(viral_status_out, highscore, taxid_best) %>% count %>% \n group_by(viral_status_out, highscore) %>% mutate(p=n/sum(n)) %>% \n rename(taxid = taxid_best) %>%\n left_join(viral_taxa, by=\"taxid\") %>%\n arrange(desc(p))\nfp2_major_tab <- fp2 %>% filter(p > major_threshold) %>% arrange(desc(p))\nfp2_major_list <- fp2_major_tab %>% pull(name) %>% sort %>% unique %>% c(., \"Other\")\nfp2_major <- fp2 %>% mutate(major = p > major_threshold) %>% \n mutate(name_display = ifelse(major, name, \"Other\")) %>%\n group_by(viral_status_out, highscore, name_display) %>% \n summarize(n=sum(n), p=sum(p), .groups = \"drop\") %>%\n mutate(name_display = factor(name_display, levels = fp2_major_list),\n score_display = ifelse(highscore, \"S >= 20\", \"S < 20\"),\n status_display = ifelse(viral_status_out, \"True positive\", \"False positive\"))\n\n# Plot\ng_fp2 <- ggplot(fp2_major, aes(x=score_display, y=p, 
fill=name_display)) +\n geom_col(position=\"stack\") +\n scale_x_discrete(name = \"True positive?\") +\n scale_y_continuous(name = \"% reads\", limits = c(0,1.01), \n breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_fill_manual(values = palette_viral, name = \"Viral\\ntaxon\") +\n facet_grid(.~status_display) +\n guides(fill=guide_legend(ncol=3)) +\n theme_kit\ng_fp2\n\n\n\n\n\n\n\nHuman-infecting viruses: overall relative abundance\n\nCode# Get raw read counts\nread_counts_raw <- basic_stats_raw %>%\n select(sample, location, sample_group, n_reads_raw = n_read_pairs)\n\n# Get HV read counts\nmrg_hv <- mrg2 %>% mutate(hv_status = assigned_hv | hit_hv | highscore) %>%\n rename(taxid_all = taxid, taxid = taxid_best)\nread_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% \n count(name=\"n_reads_hv\")\nread_counts <- read_counts_raw %>% left_join(read_counts_hv, by=\"sample\") %>%\n mutate(n_reads_hv = replace_na(n_reads_hv, 0))\n\n# Aggregate\nread_counts_grp <- read_counts %>% group_by(location, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample= \"All samples\")\nread_counts_st <- read_counts_grp %>% group_by(sample, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(location = \"All locations\")\nread_counts_loc <- read_counts_grp %>%\n group_by(sample, location) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample_group = \"All sample types\")\nread_counts_tot <- read_counts_loc %>% group_by(sample, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(location = \"All locations\")\nread_counts_agg <- bind_rows(read_counts_grp, read_counts_st,\n read_counts_loc, read_counts_tot) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw,\n location = factor(location, 
levels = c(levels(libraries$location), \"All locations\")),\n sample_group = factor(sample_group, levels = c(levels(libraries$sample_group), \"All sample types\")))\n\n\nApplying a disjunctive cutoff at S=20 identifies 230 read pairs as human-viral. This gives an overall relative HV abundance of \\(8.50 \\times 10^{-8}\\). While very low across all sample types, HV RA was noticeably higher in influent and primary sludge than in sample types that had undergone more extensive processing (effluent and processed sludge):\n\nCode# Visualize\ng_phv_agg <- ggplot(read_counts_agg, aes(x=sample_group, color=location)) +\n geom_point(aes(y=p_reads_hv)) +\n scale_y_log10(\"Relative abundance of human virus reads\") +\n scale_color_brewer(name=\"Location\", palette=\"Dark2\") + theme_kit\ng_phv_agg\n\n\n\n\n\n\n\nThis is by far the lowest HV relative abundance I’ve seen across any of the datasets I’ve analyzed:\n\nCode# Collate past RA values\nra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Brumfield\", 5e-5, \"RNA\", FALSE,\n \"Brumfield\", 3.66e-7, \"DNA\", FALSE,\n \"Spurbeck\", 5.44e-6, \"RNA\", FALSE,\n \"Yang\", 3.62e-4, \"RNA\", FALSE,\n \"Rothman (unenriched)\", 1.87e-5, \"RNA\", FALSE,\n \"Rothman (panel-enriched)\", 3.3e-5, \"RNA\", TRUE,\n \"Crits-Christoph (unenriched)\", 1.37e-5, \"RNA\", FALSE,\n \"Crits-Christoph (panel-enriched)\", 1.26e-2, \"RNA\", TRUE,\n \"Prussin (non-control)\", 1.63e-5, \"RNA\", FALSE,\n \"Prussin (non-control)\", 4.16e-5, \"DNA\", FALSE,\n \"Rosario (non-control)\", 1.21e-5, \"RNA\", FALSE,\n \"Rosario (non-control)\", 1.50e-4, \"DNA\", FALSE,\n \"Leung\", 1.73e-5, \"DNA\", FALSE,\n \"Brinch\", 3.88e-6, \"DNA\", FALSE\n)\n\n# Collate new RA values\nra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Bengtsson-Palme\", 8.86e-8, \"DNA\", FALSE)\n\n\n# Plot\nscale_color_na <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Nucleic acid type\")\nra_comp <- bind_rows(ra_past, ra_new) %>% 
mutate(dataset = fct_inorder(dataset))\ng_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +\n geom_point() +\n scale_color_na() +\n scale_x_log10(name=\"Relative abundance of human virus reads\") +\n theme_base + theme(axis.title.y = element_blank())\ng_ra_comp\n\n\n\n\n\n\n\nHuman-infecting viruses: taxonomy and composition\nIn investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 13 samples, 3 from influent and 10 from primary sludge, met this criterion.\nAt the family level, most samples across all locations were dominated by Poxviridae and Adenoviridae, with Herpesviridae, Papillomaviridae and Picornaviridae also making a significant appearance in at least some samples:\n\nCode# Get viral taxon names for putative HV reads\nviral_taxa$name[viral_taxa$taxid == 249588] <- \"Mamastrovirus\"\nviral_taxa$name[viral_taxa$taxid == 194960] <- \"Kobuvirus\"\nviral_taxa$name[viral_taxa$taxid == 688449] <- \"Salivirus\"\nviral_taxa$name[viral_taxa$taxid == 585893] <- \"Picobirnaviridae\"\nviral_taxa$name[viral_taxa$taxid == 333922] <- \"Betapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 334207] <- \"Betapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 369960] <- \"Porcine type-C oncovirus\"\nviral_taxa$name[viral_taxa$taxid == 333924] <- \"Betapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 687329] <- \"Anelloviridae\"\nviral_taxa$name[viral_taxa$taxid == 325455] <- \"Gammapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 333750] <- \"Alphapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 694002] <- \"Betacoronavirus\"\nviral_taxa$name[viral_taxa$taxid == 334202] <- \"Mupapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 197911] <- \"Alphainfluenzavirus\"\nviral_taxa$name[viral_taxa$taxid == 186938] <- \"Respirovirus\"\nviral_taxa$name[viral_taxa$taxid == 
333926] <- \"Gammapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337051] <- \"Betapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337043] <- \"Alphapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 694003] <- \"Betacoronavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 334204] <- \"Mupapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 334208] <- \"Betapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 333928] <- \"Gammapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 337039] <- \"Alphapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 333929] <- \"Gammapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 337042] <- \"Alphapapillomavirus 7\"\nviral_taxa$name[viral_taxa$taxid == 334203] <- \"Mupapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 333757] <- \"Alphapapillomavirus 8\"\nviral_taxa$name[viral_taxa$taxid == 337050] <- \"Alphapapillomavirus 6\"\nviral_taxa$name[viral_taxa$taxid == 333767] <- \"Alphapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 333754] <- \"Alphapapillomavirus 10\"\nviral_taxa$name[viral_taxa$taxid == 687363] <- \"Torque teno virus 24\"\nviral_taxa$name[viral_taxa$taxid == 687342] <- \"Torque teno virus 3\"\nviral_taxa$name[viral_taxa$taxid == 687359] <- \"Torque teno virus 20\"\nviral_taxa$name[viral_taxa$taxid == 194441] <- \"Primate T-lymphotropic virus 2\"\nviral_taxa$name[viral_taxa$taxid == 334209] <- \"Betapapillomavirus 5\"\nviral_taxa$name[viral_taxa$taxid == 194965] <- \"Aichivirus B\"\nviral_taxa$name[viral_taxa$taxid == 333930] <- \"Gammapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 337048] <- \"Alphapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337041] <- \"Alphapapillomavirus 9\"\nviral_taxa$name[viral_taxa$taxid == 337049] <- \"Alphapapillomavirus 11\"\nviral_taxa$name[viral_taxa$taxid == 337044] <- \"Alphapapillomavirus 5\"\n\n# Filter samples and add viral taxa information\nsamples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% 
pull(sample)\nmrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep) %>% left_join(viral_taxa, by=\"taxid\") \n\n# Discover viral species & genera for HV reads\nraise_rank <- function(read_db, taxid_db, out_rank = \"species\", verbose = FALSE){\n # Get higher ranks than search rank\n ranks <- c(\"subspecies\", \"species\", \"subgenus\", \"genus\", \"subfamily\", \"family\", \"suborder\", \"order\", \"class\", \"subphylum\", \"phylum\", \"kingdom\", \"superkingdom\")\n rank_match <- which.max(ranks == out_rank)\n high_ranks <- ranks[rank_match:length(ranks)]\n # Merge read DB and taxid DB\n reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n # Extract sequences that are already at appropriate rank\n reads_rank <- filter(reads, rank == out_rank)\n # Drop sequences at a higher rank and return unclassified sequences\n reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...\n # Promote read taxids and re-merge with taxid DB, then re-classify and filter\n reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%\n bind_rows(reads_rank)\n reads_norank <- reads_remaining %>%\n filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n }\n # Finally, extract and append reads that were excluded during the process\n reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)\n reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n return(reads_out)\n}\nhv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, \"species\")\nhv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, \"genus\")\nhv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, 
\"family\")\n\n\n\nCodethreshold_major_family <- 0.08\n\n# Count reads for each human-viral family\nhv_family_counts <- hv_reads_family %>% \n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nhv_family_major_tab <- hv_family_counts %>% group_by(name) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > threshold_major_family)\nhv_family_counts_major <- hv_family_counts %>%\n mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_family_major_tab$name, \"Other\")))\nhv_family_counts_display <- hv_family_counts_major %>%\n rename(p_reads = p_reads_hv, classification = name_display)\n\n# Plot\ng_hv_family <- g_comp_base + \n geom_col(data=hv_family_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral family\") +\n labs(title=\"Family composition of human-viral reads\") +\n guides(fill=guide_legend(ncol=4)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\ng_hv_family\n\n\n\n\n\n\nCode# Get most prominent families for text\nhv_family_collate <- hv_family_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_hv), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nIn investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that 
family made up at least 10% of human-viral reads:\n\nCodethreshold_major_species <- 0.1\ntaxid_pox <- 10240\n\n# Get set of poxviridae reads\npox_samples <- hv_family_counts %>% filter(taxid == taxid_pox) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npox_ids <- hv_reads_family %>% \n filter(taxid == taxid_pox, sample %in% pox_samples) %>%\n pull(seq_id)\n\n# Count reads for each poxviridae species\npox_species_counts <- hv_reads_species %>%\n filter(seq_id %in% pox_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_pox = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npox_species_major_tab <- pox_species_counts %>% group_by(name) %>% \n filter(p_reads_pox == max(p_reads_pox)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_pox)) %>% \n filter(p_reads_pox > threshold_major_species)\npox_species_counts_major <- pox_species_counts %>%\n mutate(name_display = ifelse(name %in% pox_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_pox = sum(n_reads_hv),\n p_reads_pox = sum(p_reads_pox), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(pox_species_major_tab$name, \"Other\")))\npox_species_counts_display <- pox_species_counts_major %>%\n rename(p_reads = p_reads_pox, classification = name_display)\n\n# Plot\ng_pox_species <- g_comp_base + \n geom_col(data=pox_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Poxviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Poxviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, 
face=\"plain\"))\n\ng_pox_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npox_species_collate <- pox_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_pox), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_adeno <- 10508\n\n# Get set of adenoviridae reads\nadeno_samples <- hv_family_counts %>% filter(taxid == taxid_adeno) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\nadeno_ids <- hv_reads_family %>% \n filter(taxid == taxid_adeno, sample %in% adeno_samples) %>%\n pull(seq_id)\n\n# Count reads for each adenoviridae species\nadeno_species_counts <- hv_reads_species %>%\n filter(seq_id %in% adeno_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_adeno = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nadeno_species_major_tab <- adeno_species_counts %>% group_by(name) %>% \n filter(p_reads_adeno == max(p_reads_adeno)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_adeno)) %>% \n filter(p_reads_adeno > threshold_major_species)\nadeno_species_counts_major <- adeno_species_counts %>%\n mutate(name_display = ifelse(name %in% adeno_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_adeno = sum(n_reads_hv),\n p_reads_adeno = sum(p_reads_adeno), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(adeno_species_major_tab$name, \"Other\")))\nadeno_species_counts_display <- adeno_species_counts_major %>%\n rename(p_reads = p_reads_adeno, classification = name_display)\n\n# Plot\ng_adeno_species <- g_comp_base + \n geom_col(data=adeno_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Adenoviridae Reads\", limits=c(0,1.01), \n breaks = 
seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Adenoviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_adeno_species\n\n\n\n\n\n\nCode# Get most prominent species for text\nadeno_species_collate <- adeno_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_adeno), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_herpes <- 10292\n\n# Get set of herpesviridae reads\nherpes_samples <- hv_family_counts %>% filter(taxid == taxid_herpes) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\nherpes_ids <- hv_reads_family %>% \n filter(taxid == taxid_herpes, sample %in% herpes_samples) %>%\n pull(seq_id)\n\n# Count reads for each herpesviridae species\nherpes_species_counts <- hv_reads_species %>%\n filter(seq_id %in% herpes_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_herpes = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nherpes_species_major_tab <- herpes_species_counts %>% group_by(name) %>% \n filter(p_reads_herpes == max(p_reads_herpes)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_herpes)) %>% \n filter(p_reads_herpes > threshold_major_species)\nherpes_species_counts_major <- herpes_species_counts %>%\n mutate(name_display = ifelse(name %in% herpes_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_herpes = sum(n_reads_hv),\n p_reads_herpes = sum(p_reads_herpes), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(herpes_species_major_tab$name, 
\"Other\")))\nherpes_species_counts_display <- herpes_species_counts_major %>%\n rename(p_reads = p_reads_herpes, classification = name_display)\n\n# Plot\ng_herpes_species <- g_comp_base + \n geom_col(data=herpes_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% herpesviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of herpesviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_herpes_species\n\n\n\n\n\n\nCode# Get most prominent species for text\nherpes_species_collate <- herpes_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_herpes), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_papilloma <- 151340\n\n# Get set of papillomaviridae reads\npapilloma_samples <- hv_family_counts %>% filter(taxid == taxid_papilloma) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npapilloma_ids <- hv_reads_family %>% \n filter(taxid == taxid_papilloma, sample %in% papilloma_samples) %>%\n pull(seq_id)\n\n# Count reads for each papillomaviridae species\npapilloma_species_counts <- hv_reads_species %>%\n filter(seq_id %in% papilloma_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n mutate(p_reads_papilloma = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npapilloma_species_major_tab <- papilloma_species_counts %>% group_by(name) %>% \n filter(p_reads_papilloma == max(p_reads_papilloma)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_papilloma)) %>% \n filter(p_reads_papilloma > threshold_major_species)\npapilloma_species_counts_major <- 
papilloma_species_counts %>%\n mutate(name_display = ifelse(name %in% papilloma_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_papilloma = sum(n_reads_hv),\n p_reads_papilloma = sum(p_reads_papilloma), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(papilloma_species_major_tab$name, \"Other\")))\npapilloma_species_counts_display <- papilloma_species_counts_major %>%\n rename(p_reads = p_reads_papilloma, classification = name_display)\n\n# Plot\ng_papilloma_species <- g_comp_base + \n geom_col(data=papilloma_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Papillomaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Papillomaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_papilloma_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npapilloma_species_collate <- papilloma_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_papilloma), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_picorna <- 12058\n\n# Get set of picornaviridae reads\npicorna_samples <- hv_family_counts %>% filter(taxid == taxid_picorna) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npicorna_ids <- hv_reads_family %>% \n filter(taxid == taxid_picorna, sample %in% picorna_samples) %>%\n pull(seq_id)\n\n# Count reads for each picornaviridae species\npicorna_species_counts <- hv_reads_species %>%\n filter(seq_id %in% picorna_ids) %>%\n group_by(sample, location, sample_group, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, location, sample_group) %>%\n 
mutate(p_reads_picorna = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npicorna_species_major_tab <- picorna_species_counts %>% group_by(name) %>% \n filter(p_reads_picorna == max(p_reads_picorna)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_picorna)) %>% \n filter(p_reads_picorna > threshold_major_species)\npicorna_species_counts_major <- picorna_species_counts %>%\n mutate(name_display = ifelse(name %in% picorna_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, location, sample_group, name_display) %>%\n summarize(n_reads_picorna = sum(n_reads_hv),\n p_reads_picorna = sum(p_reads_picorna), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(picorna_species_major_tab$name, \"Other\")))\npicorna_species_counts_display <- picorna_species_counts_major %>%\n rename(p_reads = p_reads_picorna, classification = name_display)\n\n# Plot\ng_picorna_species <- g_comp_base + \n geom_col(data=picorna_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Picornaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Picornaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_picorna_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npicorna_species_collate <- picorna_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_picorna), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nFinally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:\n\nCode# Define reference genera\npath_genera_rna <- c(\"Mamastrovirus\", \"Enterovirus\", \"Salivirus\", \"Kobuvirus\", \"Norovirus\", 
\"Sapovirus\", \"Rotavirus\", \"Alphacoronavirus\", \"Betacoronavirus\", \"Alphainfluenzavirus\", \"Betainfluenzavirus\", \"Lentivirus\")\npath_genera_dna <- c(\"Mastadenovirus\", \"Alphapolyomavirus\", \"Betapolyomavirus\", \"Alphapapillomavirus\", \"Betapapillomavirus\", \"Gammapapillomavirus\", \"Orthopoxvirus\", \"Simplexvirus\",\n \"Lymphocryptovirus\", \"Cytomegalovirus\", \"Dependoparvovirus\")\npath_genera <- bind_rows(tibble(name=path_genera_rna, genome_type=\"RNA genome\"),\n tibble(name=path_genera_dna, genome_type=\"DNA genome\")) %>%\n left_join(viral_taxa, by=\"name\")\n\n# Count in each sample\nmrg_hv_named_all <- mrg_hv %>% left_join(viral_taxa, by=\"taxid\")\nhv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, \"genus\")\nn_path_genera <- hv_reads_genus_all %>% \n group_by(sample, location, sample_group, name, taxid) %>% \n count(name=\"n_reads_viral\") %>% \n inner_join(path_genera, by=c(\"name\", \"taxid\")) %>%\n left_join(read_counts_raw, by=c(\"sample\", \"location\", \"sample_group\")) %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n# Pivot out and back to add zero lines\nn_path_genera_out <- n_path_genera %>% ungroup %>% select(sample, name, n_reads_viral) %>%\n pivot_wider(names_from=\"name\", values_from=\"n_reads_viral\", values_fill=0) %>%\n pivot_longer(-sample, names_to=\"name\", values_to=\"n_reads_viral\") %>%\n left_join(read_counts_raw, by=\"sample\") %>%\n left_join(path_genera, by=\"name\") %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n## Aggregate across dates\nn_path_genera_stype <- n_path_genera_out %>% \n group_by(name, taxid, genome_type, sample_group) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_viral = sum(n_reads_viral), .groups = \"drop\") %>%\n mutate(sample=\"All samples\", location=\"All locations\",\n p_reads_viral = n_reads_viral/n_reads_raw,\n na_type = \"DNA\")\n\n# Plot\ng_path_genera <- ggplot(n_path_genera_stype,\n aes(y=name, x=p_reads_viral, color=sample_group)) 
+\n geom_point() +\n scale_x_log10(name=\"Relative abundance\") +\n scale_color_st() +\n facet_grid(genome_type~., scales=\"free_y\") +\n theme_base + theme(axis.title.y = element_blank())\ng_path_genera\n\n\n\n\n\n\n\nConclusion\nThis dataset is a cautionary tale about the dangers of looking for viruses in samples that were processed with something else in mind. The sample preparation methods used here would be expected to select against viruses, and it shows, with by far the lowest overall human-viral RA of any dataset I’ve investigated so far. Even so, there were some interesting results, such as the higher level of human viruses in influent and primary sludge compared to more processed sample types." + }, + { + "objectID": "notebooks/2024-05-01_ng.html", + "href": "notebooks/2024-05-01_ng.html", + "title": "Workflow analysis of Ng et al. (2019)", + "section": "", + "text": "Continuing my analysis of datasets from the P2RA preprint, I analyzed the data from Ng et al. (2019), a study that used DNA sequencing of wastewater samples to characterize the bacterial microbiota and resistome in Singapore. This study used processing methods I haven’t seen before:\n\nAll samples passed through “a filter” on-site at the WWTP prior to further processing in lab.\nSamples concentrated to 400ml using a Hemoflow dialyzer “via standard bloodline tubing”.\nEluted concentrates then further concentrated by passing through a 0.22um filter and retaining the retentate (NB: this is anti-selecting for viruses).\nSludge samples were instead centrifuged and the pellet kept for further analysis.\nAfter concentration, samples underwent DNA extraction with the PowerSoil DNA Isolation Kit, then underwent library prep and Illumina sequencing with an Illumina HiSeq2500 (2x250bp).\n\nSince this was a bacteria-focused study that used processing methods we expect to select against viruses, we wouldn’t expect to see high viral relative abundances here. 
Nevertheless, it’s worth seeing what we can see.\nThe raw data\nSamples were collected from six different locations in the treatment plant on six different dates (from October 2016 to August 2017) for a total of 36 samples:\n\n\nCode# Importing the data is a bit more complicated this time as the samples are split across three pipeline runs\ndata_dir <- \"../data/2024-05-01_ng\"\n\n# Data input paths\nlibraries_path <- file.path(data_dir, \"sample-metadata.csv\")\nbasic_stats_path <- file.path(data_dir, \"qc_basic_stats.tsv.gz\")\nadapter_stats_path <- file.path(data_dir, \"qc_adapter_stats.tsv.gz\")\nquality_base_stats_path <- file.path(data_dir, \"qc_quality_base_stats.tsv.gz\")\nquality_seq_stats_path <- file.path(data_dir, \"qc_quality_sequence_stats.tsv.gz\")\n\n# Import libraries and extract metadata from sample names\nlocs <- c(\"INF\", \"PST\", \"SLUDGE\", \"SST\", \"MBR\", \"WW\")\nlibraries_raw <- lapply(libraries_path, read_csv, show_col_types = FALSE) %>%\n bind_rows\nlibraries <- libraries_raw %>%\n mutate(sample_type_long = gsub(\" \\\\(.*\", \"\", sample_type),\n sample_type_short = ifelse(sample_type_long == \"Influent\", \"INF\",\n sub(\".*\\\\((.*)\\\\)\", \"\\\\1\", sample_type)),\n sample_type_short = factor(sample_type_short, levels=locs)) %>%\n arrange(sample_type_short, date) %>%\n mutate(sample_type_long = fct_inorder(sample_type_long),\n sample = fct_inorder(sample)) %>%\n arrange(date) %>%\n mutate(date = fct_inorder(date))\n\n# Make table\ncount_samples <- libraries %>% group_by(sample_type_long, sample_type_short) %>%\n count %>%\n rename(`Sample Type`=sample_type_long, Abbreviation=sample_type_short)\ncount_samples\n\n\n \n\n\n\n\nCode# Import QC data\nstages <- c(\"raw_concat\", \"cleaned\", \"dedup\", \"ribo_initial\", \"ribo_secondary\")\nimport_basic <- function(paths){\n lapply(paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>%\n inner_join(libraries, by=\"sample\") %>%\n arrange(sample_type_short, date, sample) %>%\n 
mutate(stage = factor(stage, levels = stages),\n sample = fct_inorder(sample))\n}\nimport_basic_paired <- function(paths){\n import_basic(paths) %>% arrange(read_pair) %>% \n mutate(read_pair = fct_inorder(as.character(read_pair)))\n}\nbasic_stats <- import_basic(basic_stats_path)\nadapter_stats <- import_basic_paired(adapter_stats_path)\nquality_base_stats <- import_basic_paired(quality_base_stats_path)\nquality_seq_stats <- import_basic_paired(quality_seq_stats_path)\n\n# Filter to raw data\nbasic_stats_raw <- basic_stats %>% filter(stage == \"raw_concat\")\nadapter_stats_raw <- adapter_stats %>% filter(stage == \"raw_concat\")\nquality_base_stats_raw <- quality_base_stats %>% filter(stage == \"raw_concat\")\nquality_seq_stats_raw <- quality_seq_stats %>% filter(stage == \"raw_concat\")\n\n# Get key values for readout\nraw_read_counts <- basic_stats_raw %>% ungroup %>% \n summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs),\n rmean=mean(n_read_pairs), \n rtot = sum(n_read_pairs),\n btot = sum(n_bases_approx),\n dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\n\nThese 36 samples yielded 26.6M-74.1M (mean 46.1M) reads per sample, for a total of 1.7B read pairs (830 gigabases of sequence). Read qualities were mostly high but tailed off towards the 3’ end, requiring some trimming. Adapter levels were fairly low but still in need of some trimming. 
Inferred duplication levels were variable (1-64%, mean 31%), with libraries with lower read depth showing much lower duplicate levels.\n\nCode# Prepare data\nbasic_stats_raw_metrics <- basic_stats_raw %>%\n select(sample, sample_type_short, date,\n `# Read pairs` = n_read_pairs,\n `Total base pairs\\n(approx)` = n_bases_approx,\n `% Duplicates\\n(FASTQC)` = percent_duplicates) %>%\n pivot_longer(-(sample:date), names_to = \"metric\", values_to = \"value\") %>%\n mutate(metric = fct_inorder(metric))\n\n# Set up plot templates\nscale_fill_st <- purrr::partial(scale_fill_brewer, palette=\"Set1\", name=\"Sample Type\")\ng_basic <- ggplot(basic_stats_raw_metrics, \n aes(x=sample, y=value, fill=sample_type_short, \n group=interaction(sample_type_short,sample))) +\n geom_col(position = \"dodge\") +\n scale_y_continuous(expand=c(0,0)) +\n expand_limits(y=c(0,100)) +\n scale_fill_st() + \n facet_grid(metric~., scales = \"free\", space=\"free_x\", switch=\"y\") +\n theme_xblank + theme(\n axis.title.y = element_blank(),\n strip.text.y = element_text(face=\"plain\")\n )\ng_basic\n\n\n\n\n\n\n\n\nCode# Set up plotting templates\nscale_color_st <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Sample Type\")\ng_qual_raw <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters_raw <- g_qual_raw + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,NA),\n breaks = seq(0,100,1), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,500,20), expand=c(0,0)) +\n facet_grid(.~adapter)\ng_adapters_raw\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base_raw <- g_qual_raw +\n geom_hline(yintercept=25, 
linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,500,20), expand=c(0,0))\ng_quality_base_raw\n\n\n\n\n\n\nCodeg_quality_seq_raw <- g_qual_raw +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0))\ng_quality_seq_raw\n\n\n\n\n\n\n\nPreprocessing\nThe average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. As expected given the observed difference in duplication levels, many more reads were lost during deduplication in liquid samples than sludge samples. Conversely, trimming and filtering consistently removed more reads in sludge than in liquid samples, though the effect was less dramatic than for deduplication. 
Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries.\n\nCoden_reads_rel <- basic_stats %>% \n select(sample, sample_type_short, date, stage, \n percent_duplicates, n_read_pairs) %>%\n group_by(sample) %>% arrange(sample, stage) %>%\n mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0),\n p_reads_lost = 1 - p_reads_retained,\n p_reads_retained_abs = n_read_pairs / n_read_pairs[1],\n p_reads_lost_abs = 1-p_reads_retained_abs,\n p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0))\nn_reads_rel_display <- n_reads_rel %>% \n group_by(`Sample Type`=sample_type_short, Stage=stage) %>% \n summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), \"-\", round(max(p_reads_lost_abs*100),1), \" (mean \", round(mean(p_reads_lost_abs*100),1), \")\"),\n `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), \"-\", round(max(p_reads_lost_abs_marginal*100),1), \" (mean \", round(mean(p_reads_lost_abs_marginal*100),1), \")\"), .groups=\"drop\") %>% \n filter(Stage != \"raw_concat\") %>%\n mutate(Stage = Stage %>% as.numeric %>% factor(labels=c(\"Trimming & filtering\", \"Deduplication\", \"Initial ribodepletion\", \"Secondary ribodepletion\")))\nn_reads_rel_display\n\n\n \n\n\n\n\nCodeg_stage_base <- ggplot(mapping=aes(x=stage, color=sample_type_short, group=sample)) +\n scale_color_st() +\n theme_kit\n\n# Plot reads over preprocessing\ng_reads_stages <- g_stage_base +\n geom_line(aes(y=n_read_pairs), data=basic_stats) +\n scale_y_continuous(\"# Read pairs\", expand=c(0,0), limits=c(0,NA))\ng_reads_stages\n\n\n\n\n\n\nCode# Plot relative read losses during preprocessing\ng_reads_rel <- g_stage_base +\n geom_line(aes(y=p_reads_lost_abs_marginal), data=n_reads_rel) +\n scale_y_continuous(\"% Total Reads Lost\", expand=c(0,0), \n labels = function(x) x*100)\ng_reads_rel\n\n\n\n\n\n\n\nData cleaning was very successful at removing 
adapters and improving read qualities:\n\nCodeg_qual <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, \n group=interaction(sample,read_pair))) + \n scale_color_st() + scale_linetype_discrete(name = \"Read Pair\") +\n guides(color=guide_legend(nrow=2,byrow=TRUE),\n linetype = guide_legend(nrow=2,byrow=TRUE)) +\n theme_base\n\n# Visualize adapters\ng_adapters <- g_qual + \n geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +\n scale_y_continuous(name=\"% Adapters\", limits=c(0,20),\n breaks = seq(0,50,10), expand=c(0,0)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~adapter)\ng_adapters\n\n\n\n\n\n\nCode# Visualize quality\ng_quality_base <- g_qual +\n geom_hline(yintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_hline(yintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +\n scale_y_continuous(name=\"Mean Phred score\", expand=c(0,0), limits=c(10,45)) +\n scale_x_continuous(name=\"Position\", limits=c(0,NA),\n breaks=seq(0,140,20), expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_base\n\n\n\n\n\n\nCodeg_quality_seq <- g_qual +\n geom_vline(xintercept=25, linetype=\"dashed\", color=\"red\") +\n geom_vline(xintercept=30, linetype=\"dashed\", color=\"red\") +\n geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +\n scale_x_continuous(name=\"Mean Phred score\", expand=c(0,0)) +\n scale_y_continuous(name=\"# Sequences\", expand=c(0,0)) +\n facet_grid(stage~.)\ng_quality_seq\n\n\n\n\n\n\n\nAccording to FASTQC, cleaning + deduplication was very effective at reducing measured duplicate levels, which fell from an average of 31% to 6.5%:\n\nCodestage_dup <- basic_stats %>% group_by(stage) %>% \n summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates),\n dmean=mean(percent_duplicates), .groups = \"drop\")\n\ng_dup_stages <- g_stage_base +\n 
geom_line(aes(y=percent_duplicates), data=basic_stats) +\n scale_y_continuous(\"% Duplicates\", limits=c(0,NA), expand=c(0,0))\ng_dup_stages\n\n\n\n\n\n\nCodeg_readlen_stages <- g_stage_base + \n geom_line(aes(y=mean_seq_len), data=basic_stats) +\n scale_y_continuous(\"Mean read length (nt)\", expand=c(0,0), limits=c(0,NA))\ng_readlen_stages\n\n\n\n\n\n\n\nHigh-level composition\nAs before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:\n\nCodeclassifications <- c(\"Filtered\", \"Duplicate\", \"Ribosomal\", \"Unassigned\",\n \"Bacterial\", \"Archaeal\", \"Viral\", \"Human\")\n\n# Import composition data\ncomp_path <- file.path(data_dir, \"taxonomic_composition.tsv.gz\")\ncomp <- read_tsv(comp_path, show_col_types = FALSE) %>%\n left_join(libraries, by=\"sample\") %>%\n mutate(classification = factor(classification, levels = classifications))\n \n\n# Summarize composition\nread_comp_summ <- comp %>% \n group_by(sample_type_short, classification) %>%\n summarize(n_reads = sum(n_reads), .groups = \"drop_last\") %>%\n mutate(n_reads = replace_na(n_reads,0),\n p_reads = n_reads/sum(n_reads),\n pc_reads = p_reads*100)\n\n\n\nCode# Prepare plotting templates\ng_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) +\n facet_wrap(~sample_type_short, scales = \"free_x\", ncol=3,\n labeller = label_wrap_gen(multi_line=FALSE, width=20)) +\n theme_xblank\nscale_y_pc_reads <- purrr::partial(scale_y_continuous, name = \"% Reads\",\n expand = c(0,0), labels = function(y) y*100)\n\n# Plot overall composition\ng_comp <- g_comp_base + geom_col(data = comp, position = \"stack\", width=1) +\n scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) +\n scale_fill_brewer(palette = \"Set1\", name = 
\"Classification\")\ng_comp\n\n\n\n\n\n\nCode# Plot composition of minor components\ncomp_minor <- comp %>% \n filter(classification %in% c(\"Archaeal\", \"Viral\", \"Human\", \"Other\"))\npalette_minor <- brewer.pal(9, \"Set1\")[6:9]\ng_comp_minor <- g_comp_base + \n geom_col(data=comp_minor, position = \"stack\", width=1) +\n scale_y_pc_reads() +\n scale_fill_manual(values=palette_minor, name = \"Classification\")\ng_comp_minor\n\n\n\n\n\n\n\n\nCodep_reads_summ_group <- comp %>%\n mutate(classification = ifelse(classification %in% c(\"Filtered\", \"Duplicate\", \"Unassigned\"), \"Excluded\", as.character(classification)),\n classification = fct_inorder(classification)) %>%\n group_by(classification, sample, sample_type_short) %>%\n summarize(p_reads = sum(p_reads), .groups = \"drop\") %>%\n group_by(classification, sample_type_short) %>%\n summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, \n pc_mean = mean(p_reads)*100, .groups = \"drop\")\np_reads_summ_prep <- p_reads_summ_group %>%\n mutate(classification = fct_inorder(classification),\n pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),\n display = paste0(pc_min, \"-\", pc_max, \"% (mean \", pc_mean, \"%)\"))\np_reads_summ <- p_reads_summ_prep %>%\n select(`Sample Type`=sample_type_short, Classification=classification, \n `Read Fraction`=display) %>%\n arrange(`Sample Type`, Classification)\np_reads_summ\n\n\n \n\n\n\nAs in previous DNA datasets, the vast majority of classified reads were bacterial in origin. The fraction of virus reads varied substantially between sample types, averaging <0.01% in influent and final effluent but closer to 0.05% in other sample types. 
Interestingly (though not particularly relevantly for this analysis), the fraction of archaeal reads was much higher in influent than other sample types, in contrast to Bengtsson-Palme where it was highest in sludge.\nAs is common for DNA data, viral reads were overwhelmingly dominated by Caudoviricetes phages, though one wet-well sample contained a substantial fraction of Alsuviricetes (a class of mainly plant pathogens that includes Virgaviridae):\n\nCode# Get Kraken reports\nreports_path <- file.path(data_dir, \"kraken_reports.tsv.gz\")\nreports <- read_tsv(reports_path, show_col_types = FALSE)\n\n# Get viral taxonomy\nviral_taxa_path <- file.path(data_dir, \"viral-taxids.tsv.gz\")\nviral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)\n\n# Filter to viral taxa\nkraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>%\n group_by(sample) %>%\n mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])\nkraken_reports_viral_cleaned <- kraken_reports_viral %>%\n inner_join(libraries, by=\"sample\") %>%\n select(-pc_reads_total, -n_reads_direct, -contains(\"minimizers\")) %>%\n select(name, taxid, p_reads_viral, n_reads_clade, everything())\n\nviral_classes <- kraken_reports_viral_cleaned %>% filter(rank == \"C\")\nviral_families <- kraken_reports_viral_cleaned %>% filter(rank == \"F\")\n\n\n\nCodemajor_threshold <- 0.02\n\n# Identify major viral classes\nviral_classes_major_tab <- viral_classes %>% \n group_by(name, taxid) %>%\n summarize(p_reads_viral_max = max(p_reads_viral), .groups=\"drop\") %>%\n filter(p_reads_viral_max >= major_threshold)\nviral_classes_major_list <- viral_classes_major_tab %>% pull(name)\nviral_classes_major <- viral_classes %>% \n filter(name %in% viral_classes_major_list) %>%\n select(name, taxid, sample, sample_type_short, date, p_reads_viral)\nviral_classes_minor <- viral_classes_major %>% \n group_by(sample, sample_type_short, date) %>%\n summarize(p_reads_viral_major = sum(p_reads_viral), .groups = \"drop\") 
%>%\n mutate(name = \"Other\", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%\n select(name, taxid, sample, sample_type_short, date, p_reads_viral)\nviral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>%\n arrange(desc(p_reads_viral)) %>% \n mutate(name = factor(name, levels=c(viral_classes_major_list, \"Other\")),\n p_reads_viral = pmax(p_reads_viral, 0)) %>%\n rename(p_reads = p_reads_viral, classification=name)\n\npalette_viral <- c(brewer.pal(12, \"Set3\"), brewer.pal(8, \"Dark2\"))\ng_classes <- g_comp_base + \n geom_col(data=viral_classes_display, position = \"stack\", width=1) +\n scale_y_continuous(name=\"% Viral Reads\", limits=c(0,1.01), breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral class\")\n \ng_classes\n\n\n\n\n\n\n\nHuman-infecting virus reads: validation\nNext, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 527 reads were identified as putatively human-viral, with half of samples showing 5 or fewer total HV read pairs.\n\nCode# Import HV read data\nhv_reads_filtered_path <- file.path(data_dir, \"hv_hits_putative_filtered.tsv.gz\")\nhv_reads_filtered <- lapply(hv_reads_filtered_path, read_tsv,\n show_col_types = FALSE) %>%\n bind_rows() %>%\n left_join(libraries, by=\"sample\")\n\n# Count reads\nn_hv_filtered <- hv_reads_filtered %>%\n group_by(sample, date, sample_type_short, seq_id) %>% count %>%\n group_by(sample, date, sample_type_short) %>% count %>% \n inner_join(basic_stats %>% filter(stage == \"ribo_initial\") %>% \n select(sample, n_read_pairs), by=\"sample\") %>% \n rename(n_putative = n, n_total = n_read_pairs) %>% \n mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)\nn_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%\n summarize(n_putative = sum(n_putative), n_total = sum(n_total), \n .groups=\"drop\") %>% \n mutate(p_reads = n_putative/n_total, pc_reads = 
p_reads*100)\n\n\n\nCode# Collapse multi-entry sequences\nrmax <- purrr::partial(max, na.rm = TRUE)\ncollapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse=\"/\"))\nmrg <- hv_reads_filtered %>% \n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%\n arrange(desc(adj_score_max)) %>%\n group_by(seq_id) %>%\n summarize(sample = collapse(sample),\n genome_id = collapse(genome_id),\n taxid_best = taxid[1],\n taxid = collapse(as.character(taxid)),\n best_alignment_score_fwd = rmax(best_alignment_score_fwd),\n best_alignment_score_rev = rmax(best_alignment_score_rev),\n query_len_fwd = rmax(query_len_fwd),\n query_len_rev = rmax(query_len_rev),\n query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],\n query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],\n classified = rmax(classified),\n assigned_name = collapse(assigned_name),\n assigned_taxid_best = assigned_taxid[1],\n assigned_taxid = collapse(as.character(assigned_taxid)),\n assigned_hv = rmax(assigned_hv),\n hit_hv = rmax(hit_hv),\n encoded_hits = collapse(encoded_hits),\n adj_score_fwd = rmax(adj_score_fwd),\n adj_score_rev = rmax(adj_score_rev)\n ) %>%\n inner_join(libraries, by=\"sample\") %>%\n mutate(kraken_label = ifelse(assigned_hv, \"Kraken2 HV\\nassignment\",\n ifelse(hit_hv, \"Kraken2 HV\\nhit\",\n \"No hit or\\nassignment\"))) %>%\n mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),\n highscore = adj_score_max >= 20)\n\n# Plot results\ngeom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)\ng_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +\n geom_vline(xintercept=20, linetype=\"dashed\", color=\"red\") +\n facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = \"free_y\") +\n scale_x_continuous(name = \"Maximum adjusted alignment score\") + \n scale_y_continuous(name=\"# Read pairs\") + \n theme_base \ng_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)\ng_vhist_0\n\n\n\n\n\n\n\nBLASTing these reads against 
nt, we find that the pipeline performs well, with only a single high-scoring false-positive read:\n\nCode# Import paired BLAST results\nblast_paired_path <- file.path(data_dir, \"hv_hits_blast_paired.tsv.gz\")\nblast_paired <- read_tsv(blast_paired_path, show_col_types = FALSE)\n\n# Add viral status\nblast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%\n mutate(viral_full = viral & n_reads == 2)\n\n# Compare to Kraken & Bowtie assignments\nmatch_taxid <- function(taxid_1, taxid_2){\n p1 <- mapply(grepl, paste0(\"/\", taxid_1, \"$\"), taxid_2)\n p2 <- mapply(grepl, paste0(\"^\", taxid_1, \"/\"), taxid_2)\n p3 <- mapply(grepl, paste0(\"^\", taxid_1, \"$\"), taxid_2)\n out <- setNames(p1|p2|p3, NULL)\n return(out)\n}\nmrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)\nblast_assign <- inner_join(blast_viral, mrg_assign, by=\"seq_id\") %>%\n mutate(taxid_match_bowtie = match_taxid(staxid, taxid),\n taxid_match_kraken = match_taxid(staxid, assigned_taxid),\n taxid_match_any = taxid_match_bowtie | taxid_match_kraken)\nblast_out <- blast_assign %>%\n group_by(seq_id) %>%\n summarize(viral_status = ifelse(any(viral_full), 2,\n ifelse(any(taxid_match_any), 2,\n ifelse(any(viral), 1, 0))),\n .groups = \"drop\")\n\n\n\nCode# Merge BLAST results with unenriched read data\nmrg_blast <- full_join(mrg, blast_out, by=\"seq_id\") %>%\n mutate(viral_status = replace_na(viral_status, 0),\n viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))\n\n# Plot\ng_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +\n scale_fill_brewer(palette = \"Set1\", name = \"Viral status\")\ng_vhist_1\n\n\n\n\n\n\n\nMy usual disjunctive score threshold of 20 gave precision, sensitivity, and F1 scores all >97%:\n\nCodetest_sens_spec <- function(tab, score_threshold){\n tab_retained <- tab %>% \n mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),\n retain = 
assigned_hv | retain_score) %>%\n group_by(viral_status_out, retain) %>% count\n pos_tru <- tab_retained %>% filter(viral_status_out == \"TRUE\", retain) %>% pull(n) %>% sum\n pos_fls <- tab_retained %>% filter(viral_status_out != \"TRUE\", retain) %>% pull(n) %>% sum\n neg_tru <- tab_retained %>% filter(viral_status_out != \"TRUE\", !retain) %>% pull(n) %>% sum\n neg_fls <- tab_retained %>% filter(viral_status_out == \"TRUE\", !retain) %>% pull(n) %>% sum\n sensitivity <- pos_tru / (pos_tru + neg_fls)\n specificity <- neg_tru / (neg_tru + pos_fls)\n precision <- pos_tru / (pos_tru + pos_fls)\n f1 <- 2 * precision * sensitivity / (precision + sensitivity)\n out <- tibble(threshold=score_threshold, sensitivity=sensitivity, \n specificity=specificity, precision=precision, f1=f1)\n return(out)\n}\nrange_f1 <- function(intab, inrange=15:45){\n tss <- purrr::partial(test_sens_spec, tab=intab)\n stats <- lapply(inrange, tss) %>% bind_rows %>%\n pivot_longer(!threshold, names_to=\"metric\", values_to=\"value\")\n return(stats)\n}\nstats_0 <- range_f1(mrg_blast)\ng_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +\n geom_vline(xintercept=20, color = \"red\", linetype = \"dashed\") +\n geom_line() +\n scale_y_continuous(name = \"Value\", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +\n scale_x_continuous(name = \"Adjusted Score Threshold\", expand = c(0,0)) +\n scale_color_brewer(palette=\"Dark2\") +\n theme_base\ng_stats_0\n\n\n\n\n\n\nCodestats_0 %>% filter(threshold == 20) %>% \n select(Threshold=threshold, Metric=metric, Value=value)\n\n\n \n\n\n\nHuman-infecting viruses: overall relative abundance\n\nCode# Get raw read counts\nread_counts_raw <- basic_stats_raw %>%\n select(sample, sample_type_short, date, n_reads_raw = n_read_pairs)\n\n# Get HV read counts\nmrg_hv <- mrg %>% mutate(hv_status = assigned_hv | highscore) %>%\n rename(taxid_all = taxid, taxid = taxid_best)\nread_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) 
%>% \n count(name=\"n_reads_hv\")\nread_counts <- read_counts_raw %>% left_join(read_counts_hv, by=\"sample\") %>%\n mutate(n_reads_hv = replace_na(n_reads_hv, 0))\n\n# Aggregate\nread_counts_grp <- read_counts %>% group_by(date, sample_type_short) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample= \"All samples\")\nread_counts_st <- read_counts_grp %>% group_by(sample, sample_type_short) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(date = \"All dates\")\nread_counts_date <- read_counts_grp %>%\n group_by(sample, date) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(sample_type_short = \"All sample types\")\nread_counts_tot <- read_counts_date %>% group_by(sample, sample_type_short) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_hv = sum(n_reads_hv), .groups=\"drop\") %>%\n mutate(date = \"All dates\")\nread_counts_agg <- bind_rows(read_counts_grp, read_counts_st,\n read_counts_date, read_counts_tot) %>%\n mutate(p_reads_hv = n_reads_hv/n_reads_raw,\n date = factor(date, levels = c(levels(libraries$date), \"All dates\")),\n sample_type_short = factor(sample_type_short, levels = c(levels(libraries$sample_type_short), \"All sample types\")))\n\n\nApplying a disjunctive cutoff at S=20 identifies 482 read pairs as human-viral. 
This gives an overall relative HV abundance of \\(2.90 \\times 10^{-7}\\); on the low end across all datasets I’ve analyzed, though higher than for Bengtsson-Palme:\n\nCode# Visualize\ng_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=sample_type_short)) +\n geom_point(aes(y=p_reads_hv)) +\n scale_y_log10(\"Relative abundance of human virus reads\") +\n scale_color_st() + theme_kit\ng_phv_agg\n\n\n\n\n\n\n\n\nCode# Collate past RA values\nra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Brumfield\", 5e-5, \"RNA\", FALSE,\n \"Brumfield\", 3.66e-7, \"DNA\", FALSE,\n \"Spurbeck\", 5.44e-6, \"RNA\", FALSE,\n \"Yang\", 3.62e-4, \"RNA\", FALSE,\n \"Rothman (unenriched)\", 1.87e-5, \"RNA\", FALSE,\n \"Rothman (panel-enriched)\", 3.3e-5, \"RNA\", TRUE,\n \"Crits-Christoph (unenriched)\", 1.37e-5, \"RNA\", FALSE,\n \"Crits-Christoph (panel-enriched)\", 1.26e-2, \"RNA\", TRUE,\n \"Prussin (non-control)\", 1.63e-5, \"RNA\", FALSE,\n \"Prussin (non-control)\", 4.16e-5, \"DNA\", FALSE,\n \"Rosario (non-control)\", 1.21e-5, \"RNA\", FALSE,\n \"Rosario (non-control)\", 1.50e-4, \"DNA\", FALSE,\n \"Leung\", 1.73e-5, \"DNA\", FALSE,\n \"Brinch\", 3.88e-6, \"DNA\", FALSE,\n \"Bengtsson-Palme\", 8.86e-8, \"DNA\", FALSE\n)\n\n# Collate new RA values\nra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,\n \"Ng\", 2.90e-7, \"DNA\", FALSE)\n\n\n# Plot\nscale_color_na <- purrr::partial(scale_color_brewer, palette=\"Set1\",\n name=\"Nucleic acid type\")\nra_comp <- bind_rows(ra_past, ra_new) %>% mutate(dataset = fct_inorder(dataset))\ng_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +\n geom_point() +\n scale_color_na() +\n scale_x_log10(name=\"Relative abundance of human virus reads\") +\n theme_base + theme(axis.title.y = element_blank())\ng_ra_comp\n\n\n\n\n\n\n\nHuman-infecting viruses: taxonomy and composition\nIn investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs 
total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 13 samples met this criterion.\nAt the family level, most samples were overwhelmingly dominated by Adenoviridae, with Picornaviridae, Polyomaviridae and Papillomaviridae making up most of the rest:\n\nCode# Get viral taxon names for putative HV reads\nviral_taxa$name[viral_taxa$taxid == 249588] <- \"Mamastrovirus\"\nviral_taxa$name[viral_taxa$taxid == 194960] <- \"Kobuvirus\"\nviral_taxa$name[viral_taxa$taxid == 688449] <- \"Salivirus\"\nviral_taxa$name[viral_taxa$taxid == 585893] <- \"Picobirnaviridae\"\nviral_taxa$name[viral_taxa$taxid == 333922] <- \"Betapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 334207] <- \"Betapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 369960] <- \"Porcine type-C oncovirus\"\nviral_taxa$name[viral_taxa$taxid == 333924] <- \"Betapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 687329] <- \"Anelloviridae\"\nviral_taxa$name[viral_taxa$taxid == 325455] <- \"Gammapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 333750] <- \"Alphapapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 694002] <- \"Betacoronavirus\"\nviral_taxa$name[viral_taxa$taxid == 334202] <- \"Mupapillomavirus\"\nviral_taxa$name[viral_taxa$taxid == 197911] <- \"Alphainfluenzavirus\"\nviral_taxa$name[viral_taxa$taxid == 186938] <- \"Respirovirus\"\nviral_taxa$name[viral_taxa$taxid == 333926] <- \"Gammapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337051] <- \"Betapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337043] <- \"Alphapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 694003] <- \"Betacoronavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 334204] <- \"Mupapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 334208] <- \"Betapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 333928] <- \"Gammapapillomavirus 2\"\nviral_taxa$name[viral_taxa$taxid == 337039] <- \"Alphapapillomavirus 
2\"\nviral_taxa$name[viral_taxa$taxid == 333929] <- \"Gammapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 337042] <- \"Alphapapillomavirus 7\"\nviral_taxa$name[viral_taxa$taxid == 334203] <- \"Mupapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 333757] <- \"Alphapapillomavirus 8\"\nviral_taxa$name[viral_taxa$taxid == 337050] <- \"Alphapapillomavirus 6\"\nviral_taxa$name[viral_taxa$taxid == 333767] <- \"Alphapapillomavirus 3\"\nviral_taxa$name[viral_taxa$taxid == 333754] <- \"Alphapapillomavirus 10\"\nviral_taxa$name[viral_taxa$taxid == 687363] <- \"Torque teno virus 24\"\nviral_taxa$name[viral_taxa$taxid == 687342] <- \"Torque teno virus 3\"\nviral_taxa$name[viral_taxa$taxid == 687359] <- \"Torque teno virus 20\"\nviral_taxa$name[viral_taxa$taxid == 194441] <- \"Primate T-lymphotropic virus 2\"\nviral_taxa$name[viral_taxa$taxid == 334209] <- \"Betapapillomavirus 5\"\nviral_taxa$name[viral_taxa$taxid == 194965] <- \"Aichivirus B\"\nviral_taxa$name[viral_taxa$taxid == 333930] <- \"Gammapapillomavirus 4\"\nviral_taxa$name[viral_taxa$taxid == 337048] <- \"Alphapapillomavirus 1\"\nviral_taxa$name[viral_taxa$taxid == 337041] <- \"Alphapapillomavirus 9\"\nviral_taxa$name[viral_taxa$taxid == 337049] <- \"Alphapapillomavirus 11\"\nviral_taxa$name[viral_taxa$taxid == 337044] <- \"Alphapapillomavirus 5\"\n\n# Filter samples and add viral taxa information\nsamples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% pull(sample)\nmrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep, hv_status) %>% left_join(viral_taxa, by=\"taxid\") \n\n# Discover viral species & genera for HV reads\nraise_rank <- function(read_db, taxid_db, out_rank = \"species\", verbose = FALSE){\n # Get higher ranks than search rank\n ranks <- c(\"subspecies\", \"species\", \"subgenus\", \"genus\", \"subfamily\", \"family\", \"suborder\", \"order\", \"class\", \"subphylum\", \"phylum\", \"kingdom\", \"superkingdom\")\n rank_match <- which.max(ranks == out_rank)\n high_ranks <- 
ranks[rank_match:length(ranks)]\n # Merge read DB and taxid DB\n reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n # Extract sequences that are already at appropriate rank\n reads_rank <- filter(reads, rank == out_rank)\n # Drop sequences at a higher rank and return unclassified sequences\n reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...\n # Promote read taxids and re-merge with taxid DB, then re-classify and filter\n reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%\n bind_rows(reads_rank)\n reads_norank <- reads_remaining %>%\n filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))\n }\n # Finally, extract and append reads that were excluded during the process\n reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)\n reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%\n select(-parent_taxid, -rank, -name) %>%\n left_join(taxid_db, by=\"taxid\")\n return(reads_out)\n}\nhv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, \"species\")\nhv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, \"genus\")\nhv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, \"family\")\n\n\n\nCodethreshold_major_family <- 0.02\n\n# Count reads for each human-viral family\nhv_family_counts <- hv_reads_family %>% \n group_by(sample, date, sample_type_short, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, sample_type_short) %>%\n mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nhv_family_major_tab <- hv_family_counts %>% group_by(name) %>% \n filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%\n arrange(desc(p_reads_hv)) %>% 
filter(p_reads_hv > threshold_major_family)\nhv_family_counts_major <- hv_family_counts %>%\n mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, \"Other\")) %>%\n group_by(sample, date, sample_type_short, name_display) %>%\n summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(hv_family_major_tab$name, \"Other\")))\nhv_family_counts_display <- hv_family_counts_major %>%\n rename(p_reads = p_reads_hv, classification = name_display)\n\n# Plot\ng_hv_family <- g_comp_base + \n geom_col(data=hv_family_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% HV Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral family\") +\n labs(title=\"Family composition of human-viral reads\") +\n guides(fill=guide_legend(ncol=4)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\ng_hv_family\n\n\n\n\n\n\nCode# Get most prominent families for text\nhv_family_collate <- hv_family_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv),\n p_reads_max = max(p_reads_hv), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nIn investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that family made up at least 10% of human-viral reads:\n\nCodethreshold_major_species <- 0.05\ntaxid_adeno <- 10508\n\n# Get set of adenoviridae reads\nadeno_samples <- hv_family_counts %>% filter(taxid == taxid_adeno) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\nadeno_ids <- hv_reads_family %>% \n filter(taxid == taxid_adeno, sample %in% adeno_samples) %>%\n pull(seq_id)\n\n# Count reads for each adenoviridae species\nadeno_species_counts <- hv_reads_species %>%\n filter(seq_id %in% adeno_ids) %>%\n group_by(sample, date, sample_type_short, name, 
taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, sample_type_short) %>%\n mutate(p_reads_adeno = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\nadeno_species_major_tab <- adeno_species_counts %>% group_by(name) %>% \n filter(p_reads_adeno == max(p_reads_adeno)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_adeno)) %>% \n filter(p_reads_adeno > threshold_major_species)\nadeno_species_counts_major <- adeno_species_counts %>%\n mutate(name_display = ifelse(name %in% adeno_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, date, sample_type_short, name_display) %>%\n summarize(n_reads_adeno = sum(n_reads_hv),\n p_reads_adeno = sum(p_reads_adeno), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(adeno_species_major_tab$name, \"Other\")))\nadeno_species_counts_display <- adeno_species_counts_major %>%\n rename(p_reads = p_reads_adeno, classification = name_display)\n\n# Plot\ng_adeno_species <- g_comp_base + \n geom_col(data=adeno_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Adenoviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Adenoviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_adeno_species\n\n\n\n\n\n\nCode# Get most prominent species for text\nadeno_species_collate <- adeno_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_adeno), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_picorna <- 12058\n\n# Get set of picornaviridae reads\npicorna_samples <- hv_family_counts %>% filter(taxid == taxid_picorna) %>%\n filter(p_reads_hv >= 0.1) %>%\n 
pull(sample)\npicorna_ids <- hv_reads_family %>% \n filter(taxid == taxid_picorna, sample %in% picorna_samples) %>%\n pull(seq_id)\n\n# Count reads for each picornaviridae species\npicorna_species_counts <- hv_reads_species %>%\n filter(seq_id %in% picorna_ids) %>%\n group_by(sample, date, sample_type_short, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, sample_type_short) %>%\n mutate(p_reads_picorna = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npicorna_species_major_tab <- picorna_species_counts %>% group_by(name) %>% \n filter(p_reads_picorna == max(p_reads_picorna)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_picorna)) %>% \n filter(p_reads_picorna > threshold_major_species)\npicorna_species_counts_major <- picorna_species_counts %>%\n mutate(name_display = ifelse(name %in% picorna_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, date, sample_type_short, name_display) %>%\n summarize(n_reads_picorna = sum(n_reads_hv),\n p_reads_picorna = sum(p_reads_picorna), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(picorna_species_major_tab$name, \"Other\")))\npicorna_species_counts_display <- picorna_species_counts_major %>%\n rename(p_reads = p_reads_picorna, classification = name_display)\n\n# Plot\ng_picorna_species <- g_comp_base + \n geom_col(data=picorna_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Picornaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Picornaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_picorna_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npicorna_species_collate <- picorna_species_counts %>% 
group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_picorna), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\n\nCodethreshold_major_species <- 0.1\ntaxid_polyoma <- 151341\n\n# Get set of polyomaviridae reads\npolyoma_samples <- hv_family_counts %>% filter(taxid == taxid_polyoma) %>%\n filter(p_reads_hv >= 0.1) %>%\n pull(sample)\npolyoma_ids <- hv_reads_family %>% \n filter(taxid == taxid_polyoma, sample %in% polyoma_samples) %>%\n pull(seq_id)\n\n# Count reads for each polyomaviridae species\npolyoma_species_counts <- hv_reads_species %>%\n filter(seq_id %in% polyoma_ids) %>%\n group_by(sample, date, sample_type_short, name, taxid) %>%\n count(name = \"n_reads_hv\") %>%\n group_by(sample, date, sample_type_short) %>%\n mutate(p_reads_polyoma = n_reads_hv/sum(n_reads_hv))\n\n# Identify high-ranking families and group others\npolyoma_species_major_tab <- polyoma_species_counts %>% group_by(name) %>% \n filter(p_reads_polyoma == max(p_reads_polyoma)) %>% \n filter(row_number() == 1) %>%\n arrange(desc(p_reads_polyoma)) %>% \n filter(p_reads_polyoma > threshold_major_species)\npolyoma_species_counts_major <- polyoma_species_counts %>%\n mutate(name_display = ifelse(name %in% polyoma_species_major_tab$name, \n name, \"Other\")) %>%\n group_by(sample, date, sample_type_short, name_display) %>%\n summarize(n_reads_polyoma = sum(n_reads_hv),\n p_reads_polyoma = sum(p_reads_polyoma), \n .groups=\"drop\") %>%\n mutate(name_display = factor(name_display, \n levels = c(polyoma_species_major_tab$name, \"Other\")))\npolyoma_species_counts_display <- polyoma_species_counts_major %>%\n rename(p_reads = p_reads_polyoma, classification = name_display)\n\n# Plot\ng_polyoma_species <- g_comp_base + \n geom_col(data=polyoma_species_counts_display, position = \"stack\") +\n scale_y_continuous(name=\"% Polyomaviridae Reads\", limits=c(0,1.01), \n breaks = seq(0,1,0.2),\n expand=c(0,0), labels = function(y) y*100) +\n 
scale_fill_manual(values=palette_viral, name = \"Viral species\") +\n labs(title=\"Species composition of Polyomaviridae reads\") +\n guides(fill=guide_legend(ncol=3)) +\n theme(plot.title = element_text(size=rel(1.4), hjust=0, face=\"plain\"))\n\ng_polyoma_species\n\n\n\n\n\n\nCode# Get most prominent species for text\npolyoma_species_collate <- polyoma_species_counts %>% group_by(name, taxid) %>% \n summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_polyoma), .groups=\"drop\") %>% \n arrange(desc(n_reads_tot))\n\n\nFinally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:\n\nCode# Define reference genera\npath_genera_rna <- c(\"Mamastrovirus\", \"Enterovirus\", \"Salivirus\", \"Kobuvirus\", \"Norovirus\", \"Sapovirus\", \"Rotavirus\", \"Alphacoronavirus\", \"Betacoronavirus\", \"Alphainfluenzavirus\", \"Betainfluenzavirus\", \"Lentivirus\")\npath_genera_dna <- c(\"Mastadenovirus\", \"Alphapolyomavirus\", \"Betapolyomavirus\", \"Alphapapillomavirus\", \"Betapapillomavirus\", \"Gammapapillomavirus\", \"Orthopoxvirus\", \"Simplexvirus\",\n \"Lymphocryptovirus\", \"Cytomegalovirus\", \"Dependoparvovirus\")\npath_genera <- bind_rows(tibble(name=path_genera_rna, genome_type=\"RNA genome\"),\n tibble(name=path_genera_dna, genome_type=\"DNA genome\")) %>%\n left_join(viral_taxa, by=\"name\")\n\n# Count in each sample\nmrg_hv_named_all <- mrg_hv %>% left_join(viral_taxa, by=\"taxid\")\nhv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, \"genus\")\nn_path_genera <- hv_reads_genus_all %>% \n group_by(sample, date, sample_type_short, name, taxid) %>% \n count(name=\"n_reads_viral\") %>% \n inner_join(path_genera, by=c(\"name\", \"taxid\")) %>%\n left_join(read_counts_raw, by=c(\"sample\", \"date\", \"sample_type_short\")) %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n# Pivot out and back to add zero lines\nn_path_genera_out <- n_path_genera %>% ungroup %>% 
select(sample, name, n_reads_viral) %>%\n pivot_wider(names_from=\"name\", values_from=\"n_reads_viral\", values_fill=0) %>%\n pivot_longer(-sample, names_to=\"name\", values_to=\"n_reads_viral\") %>%\n left_join(read_counts_raw, by=\"sample\") %>%\n left_join(path_genera, by=\"name\") %>%\n mutate(p_reads_viral = n_reads_viral/n_reads_raw)\n\n## Aggregate across dates\nn_path_genera_stype <- n_path_genera_out %>% \n group_by(name, taxid, genome_type, sample_type_short) %>%\n summarize(n_reads_raw = sum(n_reads_raw),\n n_reads_viral = sum(n_reads_viral), .groups = \"drop\") %>%\n mutate(sample=\"All samples\", location=\"All locations\",\n p_reads_viral = n_reads_viral/n_reads_raw,\n na_type = \"DNA\")\n\n# Plot\ng_path_genera <- ggplot(n_path_genera_stype,\n aes(y=name, x=p_reads_viral, color=sample_type_short)) +\n geom_point() +\n scale_x_log10(name=\"Relative abundance\") +\n scale_color_st() +\n facet_grid(genome_type~., scales=\"free_y\") +\n theme_base + theme(axis.title.y = element_blank())\ng_path_genera\n\n\n\n\n\n\n\nConclusion\nThis is another dataset with very low HV abundance, arising from lab methods intended to maximize bacterial abundance at the expense of other taxa. Nevertheless, this dataset had higher HV relative abundance than the last one. Interestingly, all three wastewater DNA datasets analyzed so far have exhibited a strong predominance of adenoviruses, and especially human mastadenovirus F, among human-infecting viruses. We’ll see if this pattern persists in the other DNA wastewater datasets I have in the queue." 
} ] \ No newline at end of file diff --git a/img/2024-05-01_ng-schematic.png b/img/2024-05-01_ng-schematic.png new file mode 100644 index 0000000..eecfad3 Binary files /dev/null and b/img/2024-05-01_ng-schematic.png differ diff --git a/notebooks/2024-05-01_bengtsson-palme.qmd b/notebooks/2024-05-01_bengtsson-palme.qmd index 2f14fd0..76227ac 100644 --- a/notebooks/2024-05-01_bengtsson-palme.qmd +++ b/notebooks/2024-05-01_bengtsson-palme.qmd @@ -1,5 +1,5 @@ --- -title: "Workflow analysis of Bengtsson-Palme et al. (2021)" +title: "Workflow analysis of Bengtsson-Palme et al. (2016)" subtitle: "Wastewater grab samples from Sweden." author: "Will Bradshaw" date: 2024-05-01 @@ -145,7 +145,7 @@ g_basic <- ggplot(basic_stats_raw_metrics, expand_limits(y=c(0,100)) + scale_fill_st() + facet_grid(metric~location, scales = "free", space="free_x", switch="y") + - theme_kit + theme( + theme_xblank + theme( axis.title.y = element_blank(), strip.text.y = element_text(face="plain") ) diff --git a/notebooks/2024-05-01_ng.qmd b/notebooks/2024-05-01_ng.qmd new file mode 100644 index 0000000..009808a --- /dev/null +++ b/notebooks/2024-05-01_ng.qmd @@ -0,0 +1,1113 @@ +--- +title: "Workflow analysis of Ng et al. (2019)" +subtitle: "Wastewater from Singapore." 
+author: "Will Bradshaw" +date: 2024-05-01 +format: + html: + code-fold: true + code-tools: true + code-link: true + df-print: paged +editor: visual +title-block-banner: black +--- + +```{r} +#| label: preamble +#| include: false + +# Load packages +library(tidyverse) +library(cowplot) +library(patchwork) +library(fastqcr) +library(RColorBrewer) +source("../scripts/aux_plot-theme.R") + +# GGplot themes and scales +theme_base <- theme_base + theme(aspect.ratio = NULL) +theme_rotate <- theme_base + theme( + axis.text.x = element_text(hjust = 1, angle = 45), +) +theme_kit <- theme_rotate + theme( + axis.title.x = element_blank(), +) +theme_xblank <- theme_kit + theme( + axis.text.x = element_blank() +) +tnl <- theme(legend.position = "none") +``` + +Continuing my analysis of datasets from the [P2RA preprint](https://doi.org/10.1101/2023.12.22.23300450), I analyzed the data from [Ng et al. (2019)](https://www.frontiersin.org/articles/10.3389/fmicb.2019.00172/full), a study that used DNA sequencing of wastewater samples to characterize the bacterial microbiota and resistome in Singapore. This study used processing methods I haven't seen before: + +1. All samples passed through “a filter” on-site at the WWTP prior to further processing in lab. + +2. Samples concentrated to 400ml using a Hemoflow dialyzer “via standard bloodline tubing”. + +3. Eluted concentrates then further concentrated by passing through a 0.22um filter and retaining the **retentate** (NB: this is anti-selecting for viruses). + +4. Sludge samples were instead centrifuged and the pellet kept for further analysis. + +5. After concentration, samples underwent DNA extraction with the PowerSoil DNA Isolation Kit, then underwent library prep and Illumina sequencing with an Illumina HiSeq2500 (2x250bp). + +Since this was a bacteria-focused study that used processing methods we expect to select against viruses, we wouldn't expect to see high viral relative abundances here. 
Nevertheless, it's worth seeing what we can see. + +# The raw data + +Samples were collected from six different locations in the treatment plant on six different dates (from October 2016 to August 2017) for a total of 36 samples: + +![](/img/2024-05-01_ng-schematic.png) + +```{r} +#| warning: false +#| label: import-qc-data + +# Importing the data is a bit more complicated this time as the samples are split across three pipeline runs +data_dir <- "../data/2024-05-01_ng" + +# Data input paths +libraries_path <- file.path(data_dir, "sample-metadata.csv") +basic_stats_path <- file.path(data_dir, "qc_basic_stats.tsv.gz") +adapter_stats_path <- file.path(data_dir, "qc_adapter_stats.tsv.gz") +quality_base_stats_path <- file.path(data_dir, "qc_quality_base_stats.tsv.gz") +quality_seq_stats_path <- file.path(data_dir, "qc_quality_sequence_stats.tsv.gz") + +# Import libraries and extract metadata from sample names +locs <- c("INF", "PST", "SLUDGE", "SST", "MBR", "WW") +libraries_raw <- lapply(libraries_path, read_csv, show_col_types = FALSE) %>% + bind_rows +libraries <- libraries_raw %>% + mutate(sample_type_long = gsub(" \\(.*", "", sample_type), + sample_type_short = ifelse(sample_type_long == "Influent", "INF", + sub(".*\\((.*)\\)", "\\1", sample_type)), + sample_type_short = factor(sample_type_short, levels=locs)) %>% + arrange(sample_type_short, date) %>% + mutate(sample_type_long = fct_inorder(sample_type_long), + sample = fct_inorder(sample)) %>% + arrange(date) %>% + mutate(date = fct_inorder(date)) + +# Make table +count_samples <- libraries %>% group_by(sample_type_long, sample_type_short) %>% + count %>% + rename(`Sample Type`=sample_type_long, Abbreviation=sample_type_short) +count_samples +``` + +```{r} +#| label: process-qc-data + +# Import QC data +stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary") +import_basic <- function(paths){ + lapply(paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>% + inner_join(libraries, 
by="sample") %>% + arrange(sample_type_short, date, sample) %>% + mutate(stage = factor(stage, levels = stages), + sample = fct_inorder(sample)) +} +import_basic_paired <- function(paths){ + import_basic(paths) %>% arrange(read_pair) %>% + mutate(read_pair = fct_inorder(as.character(read_pair))) +} +basic_stats <- import_basic(basic_stats_path) +adapter_stats <- import_basic_paired(adapter_stats_path) +quality_base_stats <- import_basic_paired(quality_base_stats_path) +quality_seq_stats <- import_basic_paired(quality_seq_stats_path) + +# Filter to raw data +basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat") +adapter_stats_raw <- adapter_stats %>% filter(stage == "raw_concat") +quality_base_stats_raw <- quality_base_stats %>% filter(stage == "raw_concat") +quality_seq_stats_raw <- quality_seq_stats %>% filter(stage == "raw_concat") + +# Get key values for readout +raw_read_counts <- basic_stats_raw %>% ungroup %>% + summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs), + rmean=mean(n_read_pairs), + rtot = sum(n_read_pairs), + btot = sum(n_bases_approx), + dmin = min(percent_duplicates), dmax=max(percent_duplicates), + dmean=mean(percent_duplicates), .groups = "drop") +``` + +These 36 samples yielded 26.6M-74.1M (mean 46.1M) reads per sample, for a total of 1.7B read pairs (830 gigabases of sequence). Read qualities were mostly high but tailed off towards the 3' end, requiring some trimming. Adapter levels were fairly low but still in need of some trimming. Inferred duplication levels were variable (1-64%, mean 31%), with libraries with lower read depth showing much lower duplicate levels. 
+ +```{r} +#| fig-width: 9 +#| warning: false +#| label: plot-basic-stats + +# Prepare data +basic_stats_raw_metrics <- basic_stats_raw %>% + select(sample, sample_type_short, date, + `# Read pairs` = n_read_pairs, + `Total base pairs\n(approx)` = n_bases_approx, + `% Duplicates\n(FASTQC)` = percent_duplicates) %>% + pivot_longer(-(sample:date), names_to = "metric", values_to = "value") %>% + mutate(metric = fct_inorder(metric)) + +# Set up plot templates +scale_fill_st <- purrr::partial(scale_fill_brewer, palette="Set1", name="Sample Type") +g_basic <- ggplot(basic_stats_raw_metrics, + aes(x=sample, y=value, fill=sample_type_short, + group=interaction(sample_type_short,sample))) + + geom_col(position = "dodge") + + scale_y_continuous(expand=c(0,0)) + + expand_limits(y=c(0,100)) + + scale_fill_st() + + facet_grid(metric~., scales = "free", space="free_x", switch="y") + + theme_xblank + theme( + axis.title.y = element_blank(), + strip.text.y = element_text(face="plain") + ) +g_basic +``` + +```{r} +#| label: plot-raw-quality + +# Set up plotting templates +scale_color_st <- purrr::partial(scale_color_brewer, palette="Set1", + name="Sample Type") +g_qual_raw <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, + group=interaction(sample,read_pair))) + + scale_color_st() + scale_linetype_discrete(name = "Read Pair") + + guides(color=guide_legend(nrow=2,byrow=TRUE), + linetype = guide_legend(nrow=2,byrow=TRUE)) + + theme_base + +# Visualize adapters +g_adapters_raw <- g_qual_raw + + geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) + + scale_y_continuous(name="% Adapters", limits=c(0,NA), + breaks = seq(0,100,1), expand=c(0,0)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,500,20), expand=c(0,0)) + + facet_grid(.~adapter) +g_adapters_raw + +# Visualize quality +g_quality_base_raw <- g_qual_raw + + geom_hline(yintercept=25, linetype="dashed", color="red") + + geom_hline(yintercept=30, linetype="dashed", color="red") 
+ + geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) + + scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,500,20), expand=c(0,0)) +g_quality_base_raw + +g_quality_seq_raw <- g_qual_raw + + geom_vline(xintercept=25, linetype="dashed", color="red") + + geom_vline(xintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) + + scale_x_continuous(name="Mean Phred score", expand=c(0,0)) + + scale_y_continuous(name="# Sequences", expand=c(0,0)) +g_quality_seq_raw +``` + +# Preprocessing + +The average fraction of reads lost at each stage in the preprocessing pipeline is shown in the following table. As expected given the observed difference in duplication levels, many more reads were lost during deduplication in liquid samples than sludge samples. Conversely, trimming and filtering consistently removed more reads in sludge than in liquid samples, though the effect was less dramatic than for deduplication. Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries. 
+ +```{r} +#| label: preproc-table +n_reads_rel <- basic_stats %>% + select(sample, sample_type_short, date, stage, + percent_duplicates, n_read_pairs) %>% + group_by(sample) %>% arrange(sample, stage) %>% + mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0), + p_reads_lost = 1 - p_reads_retained, + p_reads_retained_abs = n_read_pairs / n_read_pairs[1], + p_reads_lost_abs = 1-p_reads_retained_abs, + p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0)) +n_reads_rel_display <- n_reads_rel %>% + group_by(`Sample Type`=sample_type_short, Stage=stage) %>% + summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"), + `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>% + filter(Stage != "raw_concat") %>% + mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion"))) +n_reads_rel_display +``` + +```{r} +#| label: preproc-figures +#| warning: false +#| fig-height: 4 +#| fig-width: 6 + +g_stage_base <- ggplot(mapping=aes(x=stage, color=sample_type_short, group=sample)) + + scale_color_st() + + theme_kit + +# Plot reads over preprocessing +g_reads_stages <- g_stage_base + + geom_line(aes(y=n_read_pairs), data=basic_stats) + + scale_y_continuous("# Read pairs", expand=c(0,0), limits=c(0,NA)) +g_reads_stages + +# Plot relative read losses during preprocessing +g_reads_rel <- g_stage_base + + geom_line(aes(y=p_reads_lost_abs_marginal), data=n_reads_rel) + + scale_y_continuous("% Total Reads Lost", expand=c(0,0), + labels = function(x) x*100) +g_reads_rel +``` + +Data cleaning was very successful at removing adapters and improving read qualities: + 
+```{r} +#| warning: false +#| label: plot-quality +#| fig-height: 7 + +g_qual <- ggplot(mapping=aes(color=sample_type_short, linetype=read_pair, + group=interaction(sample,read_pair))) + + scale_color_st() + scale_linetype_discrete(name = "Read Pair") + + guides(color=guide_legend(nrow=2,byrow=TRUE), + linetype = guide_legend(nrow=2,byrow=TRUE)) + + theme_base + +# Visualize adapters +g_adapters <- g_qual + + geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) + + scale_y_continuous(name="% Adapters", limits=c(0,20), + breaks = seq(0,50,10), expand=c(0,0)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,140,20), expand=c(0,0)) + + facet_grid(stage~adapter) +g_adapters + +# Visualize quality +g_quality_base <- g_qual + + geom_hline(yintercept=25, linetype="dashed", color="red") + + geom_hline(yintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) + + scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,140,20), expand=c(0,0)) + + facet_grid(stage~.) +g_quality_base + +g_quality_seq <- g_qual + + geom_vline(xintercept=25, linetype="dashed", color="red") + + geom_vline(xintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) + + scale_x_continuous(name="Mean Phred score", expand=c(0,0)) + + scale_y_continuous(name="# Sequences", expand=c(0,0)) + + facet_grid(stage~.) 
+g_quality_seq +``` + +According to FASTQC, cleaning + deduplication was very effective at reducing measured duplicate levels, which fell from an average of 31% to 6.5%: + +```{r} +#| label: preproc-dedup +#| fig-height: 3.5 +#| fig-width: 6 + +stage_dup <- basic_stats %>% group_by(stage) %>% + summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates), + dmean=mean(percent_duplicates), .groups = "drop") + +g_dup_stages <- g_stage_base + + geom_line(aes(y=percent_duplicates), data=basic_stats) + + scale_y_continuous("% Duplicates", limits=c(0,NA), expand=c(0,0)) +g_dup_stages + +g_readlen_stages <- g_stage_base + + geom_line(aes(y=mean_seq_len), data=basic_stats) + + scale_y_continuous("Mean read length (nt)", expand=c(0,0), limits=c(0,NA)) +g_readlen_stages +``` + +# High-level composition + +As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples: + +```{r} +#| label: prepare-composition + +classifications <- c("Filtered", "Duplicate", "Ribosomal", "Unassigned", + "Bacterial", "Archaeal", "Viral", "Human") + +# Import composition data +comp_path <- file.path(data_dir, "taxonomic_composition.tsv.gz") +comp <- read_tsv(comp_path, show_col_types = FALSE) %>% + left_join(libraries, by="sample") %>% + mutate(classification = factor(classification, levels = classifications)) + + +# Summarize composition +read_comp_summ <- comp %>% + group_by(sample_type_short, classification) %>% + summarize(n_reads = sum(n_reads), .groups = "drop_last") %>% + mutate(n_reads = replace_na(n_reads,0), + p_reads = n_reads/sum(n_reads), + pc_reads = p_reads*100) +``` + +```{r} +#| label: plot-composition-all +#| fig-height: 7 +#| fig-width: 8 + +# Prepare plotting templates +g_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, 
fill=classification)) + + facet_wrap(~sample_type_short, scales = "free_x", ncol=3, + labeller = label_wrap_gen(multi_line=FALSE, width=20)) + + theme_xblank +scale_y_pc_reads <- purrr::partial(scale_y_continuous, name = "% Reads", + expand = c(0,0), labels = function(y) y*100) + +# Plot overall composition +g_comp <- g_comp_base + geom_col(data = comp, position = "stack", width=1) + + scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) + + scale_fill_brewer(palette = "Set1", name = "Classification") +g_comp + +# Plot composition of minor components +comp_minor <- comp %>% + filter(classification %in% c("Archaeal", "Viral", "Human", "Other")) +palette_minor <- brewer.pal(9, "Set1")[6:9] +g_comp_minor <- g_comp_base + + geom_col(data=comp_minor, position = "stack", width=1) + + scale_y_pc_reads() + + scale_fill_manual(values=palette_minor, name = "Classification") +g_comp_minor + +``` + +```{r} +#| label: composition-summary + +p_reads_summ_group <- comp %>% + mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), "Excluded", as.character(classification)), + classification = fct_inorder(classification)) %>% + group_by(classification, sample, sample_type_short) %>% + summarize(p_reads = sum(p_reads), .groups = "drop") %>% + group_by(classification, sample_type_short) %>% + summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, + pc_mean = mean(p_reads)*100, .groups = "drop") +p_reads_summ_prep <- p_reads_summ_group %>% + mutate(classification = fct_inorder(classification), + pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)")) +p_reads_summ <- p_reads_summ_prep %>% + select(`Sample Type`=sample_type_short, 
Classification=classification, + `Read Fraction`=display) %>% + arrange(`Sample Type`, Classification) +p_reads_summ +``` + +As in previous DNA datasets, the vast majority of classified reads were bacterial in origin. The fraction of virus reads varied substantially between sample types, averaging \<0.01% in influent and final effluent but closer to 0.05% in other sample types. Interestingly (though not particularly relevantly for this analysis), the fraction of archaeal reads was much higher in influent than other sample types, in contrast to [Bengtsson-Palme](https://data.securebio.org/wills-public-notebook/notebooks/2024-05-01_bengtsson-palme.html) where it was highest in sludge. + +As is common for DNA data, viral reads were overwhelmingly dominated by *Caudoviricetes* phages, though one wet-well sample contained a substantial fraction of *Alsuviricetes* (a class of mainly plant pathogens that includes *Virgaviridae*): + +```{r} +#| label: extract-viral-taxa + +# Get Kraken reports +reports_path <- file.path(data_dir, "kraken_reports.tsv.gz") +reports <- read_tsv(reports_path, show_col_types = FALSE) + +# Get viral taxonomy +viral_taxa_path <- file.path(data_dir, "viral-taxids.tsv.gz") +viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE) + +# Filter to viral taxa +kraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>% + group_by(sample) %>% + mutate(p_reads_viral = n_reads_clade/n_reads_clade[1]) +kraken_reports_viral_cleaned <- kraken_reports_viral %>% + inner_join(libraries, by="sample") %>% + select(-pc_reads_total, -n_reads_direct, -contains("minimizers")) %>% + select(name, taxid, p_reads_viral, n_reads_clade, everything()) + +viral_classes <- kraken_reports_viral_cleaned %>% filter(rank == "C") +viral_families <- kraken_reports_viral_cleaned %>% filter(rank == "F") + +``` + +```{r} +#| label: viral-class-composition + +major_threshold <- 0.02 + +# Identify major viral classes +viral_classes_major_tab <- viral_classes %>% + 
group_by(name, taxid) %>% + summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>% + filter(p_reads_viral_max >= major_threshold) +viral_classes_major_list <- viral_classes_major_tab %>% pull(name) +viral_classes_major <- viral_classes %>% + filter(name %in% viral_classes_major_list) %>% + select(name, taxid, sample, sample_type_short, date, p_reads_viral) +viral_classes_minor <- viral_classes_major %>% + group_by(sample, sample_type_short, date) %>% + summarize(p_reads_viral_major = sum(p_reads_viral), .groups = "drop") %>% + mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>% + select(name, taxid, sample, sample_type_short, date, p_reads_viral) +viral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>% + arrange(desc(p_reads_viral)) %>% + mutate(name = factor(name, levels=c(viral_classes_major_list, "Other")), + p_reads_viral = pmax(p_reads_viral, 0)) %>% + rename(p_reads = p_reads_viral, classification=name) + +palette_viral <- c(brewer.pal(12, "Set3"), brewer.pal(8, "Dark2")) +g_classes <- g_comp_base + + geom_col(data=viral_classes_display, position = "stack", width=1) + + scale_y_continuous(name="% Viral Reads", limits=c(0,1.01), breaks = seq(0,1,0.2), + expand=c(0,0), labels = function(y) y*100) + + scale_fill_manual(values=palette_viral, name = "Viral class") + +g_classes + +``` + +# Human-infecting virus reads: validation + +Next, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 527 reads were identified as putatively human-viral, with half of samples showing 5 or fewer total HV read pairs. 

```{r}
#| label: hv-read-counts

# Import HV read data
hv_reads_filtered_path <- file.path(data_dir, "hv_hits_putative_filtered.tsv.gz")
hv_reads_filtered <- lapply(hv_reads_filtered_path, read_tsv,
                            show_col_types = FALSE) %>%
  bind_rows() %>%
  left_join(libraries, by="sample")

# Count reads: the double count() first collapses multiple rows per seq_id
# (forward/reverse mates), then counts distinct read pairs per sample.
# Denominator is the read-pair count at the ribo_initial QC stage.
n_hv_filtered <- hv_reads_filtered %>%
  group_by(sample, date, sample_type_short, seq_id) %>% count %>%
  group_by(sample, date, sample_type_short) %>% count %>%
  inner_join(basic_stats %>% filter(stage == "ribo_initial") %>%
               select(sample, n_read_pairs), by="sample") %>%
  rename(n_putative = n, n_total = n_read_pairs) %>%
  mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)
n_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%
  summarize(n_putative = sum(n_putative), n_total = sum(n_total),
            .groups="drop") %>%
  mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)
```

```{r}
#| label: plot-hv-scores
#| warning: false
#| fig-width: 8

# Collapse multi-entry sequences: one row per seq_id, taking max over numeric
# columns and "/"-joining divergent string columns.
rmax <- purrr::partial(max, na.rm = TRUE)
# NOTE(review): if x contains NA, all(x == x[1]) is NA and collapse() returns
# NA rather than a joined string — confirm NAs cannot occur in these columns.
collapse <- function(x) ifelse(all(x == x[1]), x[1], paste(x, collapse="/"))
mrg <- hv_reads_filtered %>%
  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%
  arrange(desc(adj_score_max)) %>%
  group_by(seq_id) %>%
  summarize(sample = collapse(sample),
            genome_id = collapse(genome_id),
            # taxid[1] is the highest-scoring entry (rows pre-sorted above)
            taxid_best = taxid[1],
            taxid = collapse(as.character(taxid)),
            best_alignment_score_fwd = rmax(best_alignment_score_fwd),
            best_alignment_score_rev = rmax(best_alignment_score_rev),
            query_len_fwd = rmax(query_len_fwd),
            query_len_rev = rmax(query_len_rev),
            query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
            query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
            classified = rmax(classified),
            assigned_name = collapse(assigned_name),
            assigned_taxid_best = assigned_taxid[1],
            assigned_taxid = collapse(as.character(assigned_taxid)),
            assigned_hv = rmax(assigned_hv),
            hit_hv = rmax(hit_hv),
            encoded_hits = collapse(encoded_hits),
            adj_score_fwd = rmax(adj_score_fwd),
            adj_score_rev = rmax(adj_score_rev)
            ) %>%
  inner_join(libraries, by="sample") %>%
  mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
                               ifelse(hit_hv, "Kraken2 HV\nhit",
                                      "No hit or\nassignment"))) %>%
  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),
         highscore = adj_score_max >= 20)

# Plot results
geom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)
g_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +
  geom_vline(xintercept=20, linetype="dashed", color="red") +
  # NOTE(review): labeller targets a "kit" facet variable that doesn't exist
  # here (faceting is on kraken_label) — looks like a copy-paste vestige; the
  # labeller spec is a no-op. Consider labeller(kraken_label = ...) or removal.
  facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = "free_y") +
  scale_x_continuous(name = "Maximum adjusted alignment score") +
  scale_y_continuous(name="# Read pairs") +
  theme_base
g_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)
g_vhist_0
```

BLASTing these reads against nt, we find that the pipeline performs well, with only a single high-scoring false-positive read:

```{r}
#| label: process-blast-data
#| warning: false

# Import paired BLAST results
blast_paired_path <- file.path(data_dir, "hv_hits_blast_paired.tsv.gz")
blast_paired <- read_tsv(blast_paired_path, show_col_types = FALSE)

# Add viral status: viral_full requires both mates of the pair to hit a virus
blast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%
  mutate(viral_full = viral & n_reads == 2)

# Compare to Kraken & Bowtie assignments.
# taxid_2 may be a "/"-collapsed multi-taxid string (from collapse() above);
# match if taxid_1 appears as the last (p1), first (p2), or only (p3) element.
# NOTE(review): an element in the MIDDLE of a 3+-element collapsed string is
# not matched by any of these patterns — presumably rare; confirm acceptable.
match_taxid <- function(taxid_1, taxid_2){
  p1 <- mapply(grepl, paste0("/", taxid_1, "$"), taxid_2)
  p2 <- mapply(grepl, paste0("^", taxid_1, "/"), taxid_2)
  p3 <- mapply(grepl, paste0("^", taxid_1, "$"), taxid_2)
  out <- setNames(p1|p2|p3, NULL)
  return(out)
}
mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)
blast_assign <- inner_join(blast_viral, mrg_assign, by="seq_id") %>%
    mutate(taxid_match_bowtie = match_taxid(staxid, taxid),
           taxid_match_kraken = match_taxid(staxid, assigned_taxid),
           taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
# Per-read status: 2 = both mates viral OR taxid agrees with pipeline,
# 1 = one mate viral, 0 = no viral BLAST hit
blast_out <- blast_assign %>%
  group_by(seq_id) %>%
  summarize(viral_status = ifelse(any(viral_full), 2,
                                  ifelse(any(taxid_match_any), 2,
                                         ifelse(any(viral), 1, 0))),
            .groups = "drop")
```

```{r}
#| label: plot-blast-results
#| fig-height: 6
#| warning: false

# Merge BLAST results with unenriched read data; reads BLAST returned nothing
# for are treated as non-viral (status 0)
mrg_blast <- full_join(mrg, blast_out, by="seq_id") %>%
  mutate(viral_status = replace_na(viral_status, 0),
         viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))

# Plot
g_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +
  scale_fill_brewer(palette = "Set1", name = "Viral status")
g_vhist_1
```

My usual disjunctive score threshold of 20 gave precision, sensitivity, and F1 scores all \>97%:

```{r}
#| label: plot-f1
# Compute sensitivity/specificity/precision/F1 against BLAST-derived truth
# (viral_status_out) for a given disjunctive score threshold: a read is
# retained if either mate's adjusted score exceeds the threshold, or if
# Kraken assigned it as HV.
# NOTE(review): retain_score uses strict '>' whereas highscore upstream uses
# '>= 20' — at integer scores these disagree at exactly S=20; confirm intended.
test_sens_spec <- function(tab, score_threshold){
  tab_retained <- tab %>%
    mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),
           retain = assigned_hv | retain_score) %>%
    group_by(viral_status_out, retain) %>% count
  pos_tru <- tab_retained %>% filter(viral_status_out == "TRUE", retain) %>% pull(n) %>% sum
  pos_fls <- tab_retained %>% filter(viral_status_out != "TRUE", retain) %>% pull(n) %>% sum
  neg_tru <- tab_retained %>% filter(viral_status_out != "TRUE", !retain) %>% pull(n) %>% sum
  neg_fls <- tab_retained %>% filter(viral_status_out == "TRUE", !retain) %>% pull(n) %>% sum
  sensitivity <- pos_tru / (pos_tru + neg_fls)
  specificity <- neg_tru / (neg_tru + pos_fls)
  precision <- pos_tru / (pos_tru + pos_fls)
  f1 <- 2 * precision * sensitivity / (precision + sensitivity)
  out <- tibble(threshold=score_threshold, sensitivity=sensitivity,
                specificity=specificity, precision=precision, f1=f1)
  return(out)
}
# Sweep the threshold over a range and return long-format metric values
range_f1 <- function(intab, inrange=15:45){
  tss <- purrr::partial(test_sens_spec, tab=intab)
  stats <- lapply(inrange, tss) %>% bind_rows %>%
    pivot_longer(!threshold, names_to="metric", values_to="value")
  return(stats)
}
stats_0 <- range_f1(mrg_blast)
g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
  geom_vline(xintercept=20, color = "red", linetype = "dashed") +
  geom_line() +
  scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
  scale_x_continuous(name = "Adjusted Score Threshold", expand = c(0,0)) +
  scale_color_brewer(palette="Dark2") +
  theme_base
g_stats_0
stats_0 %>% filter(threshold == 20) %>%
  select(Threshold=threshold, Metric=metric, Value=value)
```

# Human-infecting viruses: overall relative abundance

```{r}
#| label: count-hv-reads

# Get raw read counts
read_counts_raw <- basic_stats_raw %>%
  select(sample, sample_type_short, date, n_reads_raw = n_read_pairs)

# Get HV read counts (HV = Kraken-assigned OR adjusted score >= 20)
mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | highscore) %>%
  rename(taxid_all = taxid, taxid = taxid_best)
read_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>%
  count(name="n_reads_hv")
read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>%
  mutate(n_reads_hv = replace_na(n_reads_hv, 0))

# Aggregate at four levels: per date x sample type, per sample type (all
# dates), per date (all sample types), and grand total; each level fills the
# collapsed dimension with an "All ..." placeholder so the plot can facet them.
read_counts_grp <- read_counts %>% group_by(date, sample_type_short) %>%
  summarize(n_reads_raw = sum(n_reads_raw),
            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
  mutate(sample = "All samples")
read_counts_st <- read_counts_grp %>% group_by(sample, sample_type_short) %>%
  summarize(n_reads_raw = sum(n_reads_raw),
            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
  mutate(date = "All dates")
read_counts_date <- read_counts_grp %>%
  group_by(sample, date) %>%
  summarize(n_reads_raw = sum(n_reads_raw),
            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
  mutate(sample_type_short = "All sample types")
read_counts_tot <- read_counts_date %>% group_by(sample, sample_type_short) %>%
  summarize(n_reads_raw = sum(n_reads_raw),
            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
  mutate(date = "All dates")
read_counts_agg <- bind_rows(read_counts_grp, read_counts_st,
                             read_counts_date, read_counts_tot) %>%
  mutate(p_reads_hv = n_reads_hv/n_reads_raw,
         # assumes libraries$date and libraries$sample_type_short are factors
         date = factor(date, levels = c(levels(libraries$date), "All dates")),
         sample_type_short = factor(sample_type_short, levels = c(levels(libraries$sample_type_short), "All sample types")))
```

Applying a disjunctive cutoff at S=20 identifies 482 read pairs as human-viral. This gives an overall relative HV abundance of $2.90 \times 10^{-7}$; on the low end across all datasets I've analyzed, though higher than for [Bengtsson-Palme](https://data.securebio.org/wills-public-notebook/notebooks/2024-05-01_bengtsson-palme.html):

```{r}
#| label: plot-hv-ra
#| warning: false
# Visualize
g_phv_agg <- ggplot(read_counts_agg, aes(x=date, color=sample_type_short)) +
  geom_point(aes(y=p_reads_hv)) +
  scale_y_log10("Relative abundance of human virus reads") +
  scale_color_st() + theme_kit
g_phv_agg
```

```{r}
#| label: ra-hv-past

# Collate past RA values (from previous notebook entries)
ra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
                   "Brumfield", 5e-5, "RNA", FALSE,
                   "Brumfield", 3.66e-7, "DNA", FALSE,
                   "Spurbeck", 5.44e-6, "RNA", FALSE,
                   "Yang", 3.62e-4, "RNA", FALSE,
                   "Rothman (unenriched)", 1.87e-5, "RNA", FALSE,
                   "Rothman (panel-enriched)", 3.3e-5, "RNA", TRUE,
                   "Crits-Christoph (unenriched)", 1.37e-5, "RNA", FALSE,
                   "Crits-Christoph (panel-enriched)", 1.26e-2, "RNA", TRUE,
                   "Prussin (non-control)", 1.63e-5, "RNA", FALSE,
                   "Prussin (non-control)", 4.16e-5, "DNA", FALSE,
                   "Rosario (non-control)", 1.21e-5, "RNA", FALSE,
                   "Rosario (non-control)", 1.50e-4, "DNA", FALSE,
                   "Leung", 1.73e-5, "DNA", FALSE,
                   "Brinch", 3.88e-6, "DNA", FALSE,
                   "Bengtsson-Palme", 8.86e-8, "DNA", FALSE
)

# Collate new RA values
ra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
                  "Ng", 2.90e-7, "DNA", FALSE)


# Plot
scale_color_na <- purrr::partial(scale_color_brewer, palette="Set1",
                                 name="Nucleic acid type")
ra_comp <- bind_rows(ra_past, ra_new) %>% mutate(dataset = fct_inorder(dataset))
g_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +
  geom_point() +
  scale_color_na() +
  scale_x_log10(name="Relative abundance of human virus reads") +
  theme_base + theme(axis.title.y = element_blank())
g_ra_comp
```

# Human-infecting viruses: taxonomy and composition

In investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 13 samples met this criterion.

At the family level, most samples were overwhelmingly dominated by *Adenoviridae*, with *Picornaviridae*, *Polyomaviridae* and *Papillomaviridae* making up most of the rest:

```{r}
#| label: raise-hv-taxa

# Get viral taxon names for putative HV reads: patch in human-readable names
# for taxids whose names are missing/unhelpful in the reference taxonomy table
viral_taxa$name[viral_taxa$taxid == 249588] <- "Mamastrovirus"
viral_taxa$name[viral_taxa$taxid == 194960] <- "Kobuvirus"
viral_taxa$name[viral_taxa$taxid == 688449] <- "Salivirus"
viral_taxa$name[viral_taxa$taxid == 585893] <- "Picobirnaviridae"
viral_taxa$name[viral_taxa$taxid == 333922] <- "Betapapillomavirus"
viral_taxa$name[viral_taxa$taxid == 334207] <- "Betapapillomavirus 3"
viral_taxa$name[viral_taxa$taxid == 369960] <- "Porcine type-C oncovirus"
viral_taxa$name[viral_taxa$taxid == 333924] <- "Betapapillomavirus 2"
viral_taxa$name[viral_taxa$taxid == 687329] <- "Anelloviridae"
viral_taxa$name[viral_taxa$taxid == 325455] <- "Gammapapillomavirus"
viral_taxa$name[viral_taxa$taxid == 333750] <- "Alphapapillomavirus"
viral_taxa$name[viral_taxa$taxid == 694002] <- "Betacoronavirus"
viral_taxa$name[viral_taxa$taxid == 334202] <- "Mupapillomavirus"
viral_taxa$name[viral_taxa$taxid == 197911] <- "Alphainfluenzavirus"
viral_taxa$name[viral_taxa$taxid == 186938] <- "Respirovirus"
viral_taxa$name[viral_taxa$taxid == 333926] <- "Gammapapillomavirus 1"
viral_taxa$name[viral_taxa$taxid == 337051] <- "Betapapillomavirus 1"
viral_taxa$name[viral_taxa$taxid == 337043] <- "Alphapapillomavirus 4"
viral_taxa$name[viral_taxa$taxid == 694003] <- "Betacoronavirus 1"
viral_taxa$name[viral_taxa$taxid == 334204] <- "Mupapillomavirus 2"
viral_taxa$name[viral_taxa$taxid == 334208] <- "Betapapillomavirus 4"
viral_taxa$name[viral_taxa$taxid == 333928] <- "Gammapapillomavirus 2"
viral_taxa$name[viral_taxa$taxid == 337039] <- "Alphapapillomavirus 2"
viral_taxa$name[viral_taxa$taxid == 333929] <- "Gammapapillomavirus 3"
viral_taxa$name[viral_taxa$taxid == 337042] <- "Alphapapillomavirus 7"
viral_taxa$name[viral_taxa$taxid == 334203] <- "Mupapillomavirus 1"
viral_taxa$name[viral_taxa$taxid == 333757] <- "Alphapapillomavirus 8"
viral_taxa$name[viral_taxa$taxid == 337050] <- "Alphapapillomavirus 6"
viral_taxa$name[viral_taxa$taxid == 333767] <- "Alphapapillomavirus 3"
viral_taxa$name[viral_taxa$taxid == 333754] <- "Alphapapillomavirus 10"
viral_taxa$name[viral_taxa$taxid == 687363] <- "Torque teno virus 24"
viral_taxa$name[viral_taxa$taxid == 687342] <- "Torque teno virus 3"
viral_taxa$name[viral_taxa$taxid == 687359] <- "Torque teno virus 20"
viral_taxa$name[viral_taxa$taxid == 194441] <- "Primate T-lymphotropic virus 2"
viral_taxa$name[viral_taxa$taxid == 334209] <- "Betapapillomavirus 5"
viral_taxa$name[viral_taxa$taxid == 194965] <- "Aichivirus B"
viral_taxa$name[viral_taxa$taxid == 333930] <- "Gammapapillomavirus 4"
viral_taxa$name[viral_taxa$taxid == 337048] <- "Alphapapillomavirus 1"
viral_taxa$name[viral_taxa$taxid == 337041] <- "Alphapapillomavirus 9"
viral_taxa$name[viral_taxa$taxid == 337049] <- "Alphapapillomavirus 11"
viral_taxa$name[viral_taxa$taxid == 337044] <- "Alphapapillomavirus 5"

# Filter samples and add viral taxa information
# (keep samples with >5 HV read pairs, per the prose above)
samples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% pull(sample)
mrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep, hv_status) %>% left_join(viral_taxa, by="taxid")

# Discover viral species & genera for HV reads.
# raise_rank: walk each read's taxid up the taxonomy (via parent_taxid) until
# it lands on `out_rank`, then return one row per read at that rank.
#   read_db  — per-read table; must have seq_id, taxid (+ joined name/rank/
#              parent_taxid columns, which are dropped and re-derived)
#   taxid_db — taxonomy table with taxid, parent_taxid, rank, name
#   out_rank — target rank ("species", "genus", "family", ...)
#   verbose  — currently unused (NOTE(review): remove or wire up progress msgs)
# Reads already above out_rank, or whose chain dies out (NA taxid), are
# appended unmodified at the end so no read is lost.
# NOTE(review): termination assumes every parent chain eventually reaches
# out_rank, a higher rank, or NA — a cyclic/self-parenting root taxid would
# loop forever; confirm the taxonomy table guarantees this.
raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
  # Get higher ranks than search rank
  ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
  rank_match <- which.max(ranks == out_rank)  # index of first match
  high_ranks <- ranks[rank_match:length(ranks)]
  # Merge read DB and taxid DB
  reads <- read_db %>% select(-parent_taxid, -rank, -name) %>%
    left_join(taxid_db, by="taxid")
  # Extract sequences that are already at appropriate rank
  reads_rank <- filter(reads, rank == out_rank)
  # Drop sequences at a higher rank and return unclassified sequences
  reads_norank <- reads %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
  while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
    # Promote read taxids and re-merge with taxid DB, then re-classify and filter
    reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
      select(-parent_taxid, -rank, -name) %>%
      left_join(taxid_db, by="taxid")
    reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
      bind_rows(reads_rank)
    reads_norank <- reads_remaining %>%
      filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
  }
  # Finally, extract and append reads that were excluded during the process
  reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
  reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%
    select(-parent_taxid, -rank, -name) %>%
    left_join(taxid_db, by="taxid")
  return(reads_out)
}
hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
```

```{r}
#| label: hv-family
#| fig-height: 5
#| fig-width: 7

+threshold_major_family <- 0.02 + +# Count reads for each human-viral family +hv_family_counts <- hv_reads_family %>% + group_by(sample, date, sample_type_short, name, taxid) %>% + count(name = "n_reads_hv") %>% + group_by(sample, date, sample_type_short) %>% + mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv)) + +# Identify high-ranking families and group others +hv_family_major_tab <- hv_family_counts %>% group_by(name) %>% + filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>% + arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > threshold_major_family) +hv_family_counts_major <- hv_family_counts %>% + mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")) %>% + group_by(sample, date, sample_type_short, name_display) %>% + summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv), + .groups="drop") %>% + mutate(name_display = factor(name_display, + levels = c(hv_family_major_tab$name, "Other"))) +hv_family_counts_display <- hv_family_counts_major %>% + rename(p_reads = p_reads_hv, classification = name_display) + +# Plot +g_hv_family <- g_comp_base + + geom_col(data=hv_family_counts_display, position = "stack") + + scale_y_continuous(name="% HV Reads", limits=c(0,1.01), + breaks = seq(0,1,0.2), + expand=c(0,0), labels = function(y) y*100) + + scale_fill_manual(values=palette_viral, name = "Viral family") + + labs(title="Family composition of human-viral reads") + + guides(fill=guide_legend(ncol=4)) + + theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain")) +g_hv_family + +# Get most prominent families for text +hv_family_collate <- hv_family_counts %>% group_by(name, taxid) %>% + summarize(n_reads_tot = sum(n_reads_hv), + p_reads_max = max(p_reads_hv), .groups="drop") %>% + arrange(desc(n_reads_tot)) +``` + +In investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that family made up at least 10% of human-viral reads: + +```{r} 
#| label: hv-species-adeno
#| fig-height: 5
#| fig-width: 7

# NOTE(review): this chunk uses 0.05 while the picorna/polyoma chunks use
# 0.1 — confirm the lower species threshold for Adenoviridae is intentional
threshold_major_species <- 0.05
taxid_adeno <- 10508

# Get set of adenoviridae reads (from samples where the family is >= 10% of
# HV reads, per the prose above)
adeno_samples <- hv_family_counts %>% filter(taxid == taxid_adeno) %>%
  filter(p_reads_hv >= 0.1) %>%
  pull(sample)
adeno_ids <- hv_reads_family %>%
  filter(taxid == taxid_adeno, sample %in% adeno_samples) %>%
  pull(seq_id)

# Count reads for each adenoviridae species
adeno_species_counts <- hv_reads_species %>%
  filter(seq_id %in% adeno_ids) %>%
  group_by(sample, date, sample_type_short, name, taxid) %>%
  count(name = "n_reads_hv") %>%
  group_by(sample, date, sample_type_short) %>%
  mutate(p_reads_adeno = n_reads_hv/sum(n_reads_hv))

# Identify high-ranking species and group others
adeno_species_major_tab <- adeno_species_counts %>% group_by(name) %>%
  filter(p_reads_adeno == max(p_reads_adeno)) %>%
  filter(row_number() == 1) %>%
  arrange(desc(p_reads_adeno)) %>%
  filter(p_reads_adeno > threshold_major_species)
adeno_species_counts_major <- adeno_species_counts %>%
  mutate(name_display = ifelse(name %in% adeno_species_major_tab$name,
                               name, "Other")) %>%
  group_by(sample, date, sample_type_short, name_display) %>%
  summarize(n_reads_adeno = sum(n_reads_hv),
            p_reads_adeno = sum(p_reads_adeno),
            .groups="drop") %>%
  mutate(name_display = factor(name_display,
                               levels = c(adeno_species_major_tab$name, "Other")))
adeno_species_counts_display <- adeno_species_counts_major %>%
  rename(p_reads = p_reads_adeno, classification = name_display)

# Plot
g_adeno_species <- g_comp_base +
  geom_col(data=adeno_species_counts_display, position = "stack") +
  scale_y_continuous(name="% Adenoviridae Reads", limits=c(0,1.01),
                     breaks = seq(0,1,0.2),
                     expand=c(0,0), labels = function(y) y*100) +
  scale_fill_manual(values=palette_viral, name = "Viral species") +
  labs(title="Species composition of Adenoviridae reads") +
  guides(fill=guide_legend(ncol=3)) +
  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))

g_adeno_species

# Get most prominent species for text
adeno_species_collate <- adeno_species_counts %>% group_by(name, taxid) %>%
  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_adeno), .groups="drop") %>%
  arrange(desc(n_reads_tot))
```

```{r}
#| label: hv-species-picorna
#| fig-height: 5
#| fig-width: 7

threshold_major_species <- 0.1
taxid_picorna <- 12058

# Get set of picornaviridae reads
picorna_samples <- hv_family_counts %>% filter(taxid == taxid_picorna) %>%
  filter(p_reads_hv >= 0.1) %>%
  pull(sample)
picorna_ids <- hv_reads_family %>%
  filter(taxid == taxid_picorna, sample %in% picorna_samples) %>%
  pull(seq_id)

# Count reads for each picornaviridae species
picorna_species_counts <- hv_reads_species %>%
  filter(seq_id %in% picorna_ids) %>%
  group_by(sample, date, sample_type_short, name, taxid) %>%
  count(name = "n_reads_hv") %>%
  group_by(sample, date, sample_type_short) %>%
  mutate(p_reads_picorna = n_reads_hv/sum(n_reads_hv))

# Identify high-ranking species and group others
picorna_species_major_tab <- picorna_species_counts %>% group_by(name) %>%
  filter(p_reads_picorna == max(p_reads_picorna)) %>%
  filter(row_number() == 1) %>%
  arrange(desc(p_reads_picorna)) %>%
  filter(p_reads_picorna > threshold_major_species)
picorna_species_counts_major <- picorna_species_counts %>%
  mutate(name_display = ifelse(name %in% picorna_species_major_tab$name,
                               name, "Other")) %>%
  group_by(sample, date, sample_type_short, name_display) %>%
  summarize(n_reads_picorna = sum(n_reads_hv),
            p_reads_picorna = sum(p_reads_picorna),
            .groups="drop") %>%
  mutate(name_display = factor(name_display,
                               levels = c(picorna_species_major_tab$name, "Other")))
picorna_species_counts_display <- picorna_species_counts_major %>%
  rename(p_reads = p_reads_picorna, classification = name_display)

# Plot
g_picorna_species <- g_comp_base +
  geom_col(data=picorna_species_counts_display, position = "stack") +
  scale_y_continuous(name="% Picornaviridae Reads", limits=c(0,1.01),
                     breaks = seq(0,1,0.2),
                     expand=c(0,0), labels = function(y) y*100) +
  scale_fill_manual(values=palette_viral, name = "Viral species") +
  labs(title="Species composition of Picornaviridae reads") +
  guides(fill=guide_legend(ncol=3)) +
  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))

g_picorna_species

# Get most prominent species for text
picorna_species_collate <- picorna_species_counts %>% group_by(name, taxid) %>%
  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_picorna), .groups="drop") %>%
  arrange(desc(n_reads_tot))
```

```{r}
#| label: hv-species-polyoma
#| fig-height: 5
#| fig-width: 7

threshold_major_species <- 0.1
taxid_polyoma <- 151341

# Get set of polyomaviridae reads
polyoma_samples <- hv_family_counts %>% filter(taxid == taxid_polyoma) %>%
  filter(p_reads_hv >= 0.1) %>%
  pull(sample)
polyoma_ids <- hv_reads_family %>%
  filter(taxid == taxid_polyoma, sample %in% polyoma_samples) %>%
  pull(seq_id)

# Count reads for each polyomaviridae species
polyoma_species_counts <- hv_reads_species %>%
  filter(seq_id %in% polyoma_ids) %>%
  group_by(sample, date, sample_type_short, name, taxid) %>%
  count(name = "n_reads_hv") %>%
  group_by(sample, date, sample_type_short) %>%
  mutate(p_reads_polyoma = n_reads_hv/sum(n_reads_hv))

# Identify high-ranking species and group others
polyoma_species_major_tab <- polyoma_species_counts %>% group_by(name) %>%
  filter(p_reads_polyoma == max(p_reads_polyoma)) %>%
  filter(row_number() == 1) %>%
  arrange(desc(p_reads_polyoma)) %>%
  filter(p_reads_polyoma > threshold_major_species)
polyoma_species_counts_major <- polyoma_species_counts %>%
  mutate(name_display = ifelse(name %in% polyoma_species_major_tab$name,
                               name, "Other")) %>%
  group_by(sample, date, sample_type_short, name_display) %>%
  summarize(n_reads_polyoma = sum(n_reads_hv),
            p_reads_polyoma = sum(p_reads_polyoma),
            .groups="drop") %>%
  mutate(name_display = factor(name_display,
                               levels = c(polyoma_species_major_tab$name, "Other")))
polyoma_species_counts_display <- polyoma_species_counts_major %>%
  rename(p_reads = p_reads_polyoma, classification = name_display)

# Plot
g_polyoma_species <- g_comp_base +
  geom_col(data=polyoma_species_counts_display, position = "stack") +
  scale_y_continuous(name="% Polyomaviridae Reads", limits=c(0,1.01),
                     breaks = seq(0,1,0.2),
                     expand=c(0,0), labels = function(y) y*100) +
  scale_fill_manual(values=palette_viral, name = "Viral species") +
  labs(title="Species composition of Polyomaviridae reads") +
  guides(fill=guide_legend(ncol=3)) +
  theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain"))

g_polyoma_species

# Get most prominent species for text
polyoma_species_collate <- polyoma_species_counts %>% group_by(name, taxid) %>%
  summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_polyoma), .groups="drop") %>%
  arrange(desc(n_reads_tot))
```

Finally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:

```{r}
#| fig-height: 5
#| label: ra-genera
#| warning: false

# Define reference genera
path_genera_rna <- c("Mamastrovirus", "Enterovirus", "Salivirus", "Kobuvirus", "Norovirus", "Sapovirus", "Rotavirus", "Alphacoronavirus", "Betacoronavirus", "Alphainfluenzavirus", "Betainfluenzavirus", "Lentivirus")
path_genera_dna <- c("Mastadenovirus", "Alphapolyomavirus", "Betapolyomavirus", "Alphapapillomavirus", "Betapapillomavirus", "Gammapapillomavirus", "Orthopoxvirus", "Simplexvirus",
                     "Lymphocryptovirus", "Cytomegalovirus", "Dependoparvovirus")
path_genera <- bind_rows(tibble(name=path_genera_rna, genome_type="RNA genome"),
                         tibble(name=path_genera_dna, genome_type="DNA genome")) %>%
  left_join(viral_taxa, by="name")

# Count in each sample (using ALL samples here, not just the >5-HV-read subset)
mrg_hv_named_all <- mrg_hv %>% left_join(viral_taxa, by="taxid")
hv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, "genus")
n_path_genera <- hv_reads_genus_all %>%
  group_by(sample, date, sample_type_short, name, taxid) %>%
  count(name="n_reads_viral") %>%
  inner_join(path_genera, by=c("name", "taxid")) %>%
  left_join(read_counts_raw, by=c("sample", "date", "sample_type_short")) %>%
  mutate(p_reads_viral = n_reads_viral/n_reads_raw)

# Pivot out and back to add zero lines (so every genus has a row in every
# sample, with explicit zero counts where it was absent)
n_path_genera_out <- n_path_genera %>% ungroup %>% select(sample, name, n_reads_viral) %>%
  pivot_wider(names_from="name", values_from="n_reads_viral", values_fill=0) %>%
  pivot_longer(-sample, names_to="name", values_to="n_reads_viral") %>%
  left_join(read_counts_raw, by="sample") %>%
  left_join(path_genera, by="name") %>%
  mutate(p_reads_viral = n_reads_viral/n_reads_raw)

## Aggregate across dates
# NOTE(review): `location` and `na_type` are set here but unused by the plot
# below — presumably carried over from a multi-dataset template
n_path_genera_stype <- n_path_genera_out %>%
  group_by(name, taxid, genome_type, sample_type_short) %>%
  summarize(n_reads_raw = sum(n_reads_raw),
            n_reads_viral = sum(n_reads_viral), .groups = "drop") %>%
  mutate(sample="All samples", location="All locations",
         p_reads_viral = n_reads_viral/n_reads_raw,
         na_type = "DNA")

# Plot
g_path_genera <- ggplot(n_path_genera_stype,
                        aes(y=name, x=p_reads_viral, color=sample_type_short)) +
  geom_point() +
  scale_x_log10(name="Relative abundance") +
  scale_color_st() +
  facet_grid(genome_type~., scales="free_y") +
  theme_base + theme(axis.title.y = element_blank())
g_path_genera
```

# Conclusion

This is another dataset with very low HV abundance, arising from lab methods intended to maximize bacterial abundance at the expense of other taxa. Nevertheless, this dataset had higher HV relative abundance than the last one.
Interestingly, all three wastewater DNA datasets analyzed so far have exhibited a strong predominance of adenoviruses, and especially human mastadenovirus F, among human-infecting viruses. We'll see if this pattern persists in the other DNA wastewater datasets I have in the queue.