---
title: "Estimating the effect of read depth on duplication rate for Project Runway DNA data"
subtitle: "How deep can we go?"
author: "Will Bradshaw"
date: 2023-11-02
format:
  html:
    code-fold: true
    code-tools: true
    code-link: true
    df-print: paged
editor: visual
title-block-banner: black
---

```{r}
#| label: load-packages
#| include: false
library(tidyverse)
library(cowplot)
library(patchwork)
library(fastqcr)
source("../scripts/aux_plot-theme.R")
```

One relevant question for both Project Runway and other NAO sequencing is: what is the maximum read depth at which we can sequence a given sample while retaining an acceptable level of sequence duplication?

As discussed in a previous entry, duplicate reads can arise in sequencing data from a variety of processes, including true biological duplicates present in the raw sample; processing duplicates arising from amplification and other processes during sample and library prep; and sequencing duplicates arising from various processes on the actual flow cell.

As we sequence more deeply, we expect the fraction of biological and processing duplicates (but not, I think, sequencing duplicates) in our read data to increase. In the former case, this is because we are capturing a larger fraction of all the input molecules in our sample; in the latter, because we are sequencing copies of the same sequence over and over again. Intuitively, I expect the increase in processing duplicates to swamp that in biological duplicates at high read depth, at least for library prep protocols that involve amplification[^1].

One simple approach to investigating the overall effect of read depth on duplication levels in a sample is rarefaction: downsampling a library to different numbers of reads and seeing how the duplication rate changes as a function of read count. In this notebook entry, I apply this approach to sequencing data from the Project Runway initial DNA dataset, to see how duplication rate behaves in this case.

# Methods

To analyze the effect of read depth on duplication rates in this data, I first concatenated together the raw reads from the two sequencing replicates of each sample. This allowed the analysis to detect duplicates across replicates that would have been missed had each replicate been analyzed separately.

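Since concatenated gzip streams are themselves a valid gzip file, this can be done with plain `cat`; a minimal sketch, with hypothetical file names:

```
cat raw/${p}_rep1_1.fastq.gz raw/${p}_rep2_1.fastq.gz > raw/${p}_1.fastq.gz
cat raw/${p}_rep1_2.fastq.gz raw/${p}_rep2_2.fastq.gz > raw/${p}_2.fastq.gz
```
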
Next, I performed preprocessing with fastp to remove adapter sequences and low-quality bases that might interfere with duplicate detection. I didn't collapse read pairs together or discard read pairs with overall low quality.

```
# -Q and -L disable fastp's read filtering by quality and length;
# adapter trimming and tail-quality trimming are still applied.
for p in $(cat prefixes.txt); do
    echo $p
    fastp -i raw/${p}_1.fastq.gz -I raw/${p}_2.fastq.gz \
        -o preproc/${p}_fastp_1.fastq.gz -O preproc/${p}_fastp_2.fastq.gz \
        --failed_out preproc/${p}_fastp_failed.fastq.gz \
        --cut_tail --correction --detect_adapter_for_pe \
        --adapter_fasta adapters.fa --trim_poly_x --thread 16 -Q -L
done
```

I then took each pair of preprocessed FASTQ files and downsampled them to specified numbers of reads, from 10,000 to 100,000,000 in steps of one order of magnitude (OOM), using `seqtk sample`. I performed downsampling three times for each read count, with the intent of calculating the duplication rate separately for each downsampling replicate and taking the average.

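To illustrate (paths and seed hypothetical), each downsampling run looks something like the following; note that `seqtk sample` must be given the same seed (`-s`) for both files of a pair so that mates stay in sync:

```
seqtk sample -s42 preproc/${p}_fastp_1.fastq.gz 1000000 | gzip > downsampled/${p}_1e6_1.fastq.gz
seqtk sample -s42 preproc/${p}_fastp_2.fastq.gz 1000000 | gzip > downsampled/${p}_1e6_2.fastq.gz
```
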
Finally, I ran deduplication with Clumpify on each downsampled pair of read files, as well as on the full preprocessed files, and recorded the fraction of reads discarded in each case. I performed this twice, using different Clumpify settings each time:

1. First, I attempted to perform deduplication in a maximally comprehensive way, using Clumpify's `unpair repair` options to identify and remove duplicates in opposite orientation across read pairs (as discussed [here](https://data.securebio.org/wills-public-notebook/notebooks/2023-11-02_project-runway-comparison.html#conclusions)). This configuration finds more duplicates, but (a) might overestimate duplicates in cases where only one read in a pair matches another read, and (b) causes memory-related errors for large subsample sizes.

    ```
    clumpify.sh in=<in1> in2=<in2> out=<out1> out2=<out2> dedupe containment unpair repair
    ```

2. Second, due to the aforementioned issues with approach 1, I repeated deduplication without Clumpify's `unpair repair` options enabled, providing a lower-bound estimate of duplication levels which should be more consistent with estimates provided by, for example, FASTQC.

    ```
    clumpify.sh in=<in1> in2=<in2> out=<out1> out2=<out2> dedupe containment
    ```

# Results

Plotting the fraction of duplicate reads (with or without Clumpify's `unpair repair` options) gives us the following result:

```{r}
#| warning: false
# Import data
data_dir <- "../data/2023-11-06_pr-dedup/"
n_dup_path <- file.path(data_dir, "n_dup.csv")
n_dup <- read_csv(n_dup_path, show_col_types = FALSE) %>%
  mutate(n_dup = read_pairs_in - read_pairs_out,
         p_dup = n_dup/read_pairs_in,
         o_dup = p_dup/(1-p_dup))

# Reshape data: mean and SD of duplication rate across downsampling replicates
n_dup_flat <- n_dup %>% group_by(sample, unpair_repair, read_pairs_in) %>%
  summarize(p_dup_mean = mean(p_dup), p_dup_sd = sd(p_dup),
            p_dup_min = max(0, p_dup_mean-p_dup_sd),
            p_dup_max = min(1, p_dup_mean+p_dup_sd),
            o_dup_mean = mean(o_dup), o_dup_sd = sd(o_dup),
            o_dup_min = max(0, o_dup_mean-o_dup_sd),
            o_dup_max = min(1, o_dup_mean+o_dup_sd),
            .groups = "drop")

# Plot data
g_dup_flat_base <- ggplot(n_dup_flat, aes(x=read_pairs_in, y=p_dup_mean, color = sample)) +
  geom_line() + geom_errorbar(aes(ymin=p_dup_min, ymax=p_dup_max)) + geom_point(shape = 16) +
  scale_x_log10(name = "# Input Read Pairs") +
  facet_grid(.~unpair_repair, labeller = "label_both") +
  scale_color_brewer(palette = "Dark2") +
  theme_base + theme(aspect.ratio = 1)
g_dup_flat_lin <- g_dup_flat_base +
  scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,0.41),
                     breaks = seq(0,1,0.1), expand = c(0,0))
g_dup_flat_log <- g_dup_flat_base + scale_y_log10(name = "Fraction of duplicate reads")

# Show plots
g_dup_flat_lin
g_dup_flat_log
```

We can see that the fraction of duplicates reaches quite high levels as we approach the full read count, especially when `unpair repair` is enabled. The gradient of increase also looks quite steep on the linear-log plot, which would suggest that further OOM increases in read depth might result in quite large increases in the fraction of duplicates. It's also apparent that, for whatever reason, D23-13406 has substantially fewer duplicates at any given read depth than the other two samples.

However, further interpretation of these results, including extrapolation to greater read depths, is made difficult by the lack of a theoretical model for what we expect to see. It's also not clear which mode of visualization (linear-log? log-linear? log-log? fraction of duplicates expressed as probabilities or odds?) is most meaningful for interpretation.

To start resolving some of these roadblocks, I spent some time working on a very simple model of read duplication, to see what it might tell us about the expected pattern of duplicates as a function of read depth.

# A very very simple model of read duplication

- Imagine a sample containing \(N\) distinct molecules, which are uniformly amplified up to \(M = N \times 2^C\) molecules by a perfectly unbiased \(C\)-cycle PCR reaction. Adapters are ligated and the resulting library is washed across the flow cell to generate \(X\) total clusters (again, without bias[^2]). These clusters are then sequenced by a process that generates optical duplicates at some rate \(O\), for a total expectation of \(R=X\times(1+O)\) reads.

- Each cluster is selected from the \(M\) molecules in the library, without replacement. When \(C\) is large and amplification is uniform across molecules, this is well-approximated by selecting from the \(N\) input molecules with replacement, since each input molecule's \(2^C\) copies then make up a fraction \(1/N\) of the library, and sampling without replacement scarcely depletes any one molecule's copies. Under these conditions, the number of clusters generated from input molecule \(i\) is approximately \(X_i \sim \mathrm{B}(X,N^{-1})\).

- The number of optical duplicates generated from a given input molecule is then approximately \(P_i \sim \mathrm{B}(X_i,O)\), and the total number of reads corresponding to a given input molecule is thus \(R_i = X_i + P_i\). The number of duplicates generated from that molecule is then \(D_i = \max(0,R_i - 1)\), and the overall fraction of duplicates is \(F=\frac{D}{R}=\frac{\sum_i D_i}{\sum_i R_i}\).

- The expected fraction of duplicates under this model can be estimated analytically as follows[^3]:

    - \(E(R_i)=\sum_{r=1}^{2X} r \cdot P(R_i = r) = \sum_{r=1}^{2X}r\cdot\left[\sum_{k=0}^{r}P(X_i = k)\cdot P(P_i=r-k \mid X_i=k)\right]\)

    - \(E(D_i) = \sum_{r=1}^{2X} (r-1) \cdot P(R_i = r)\)

    - \(E(D) = E\left(\sum_i D_i\right) = N \cdot E(D_i)\)

    - \(R = \sum_i R_i = \sum_r r \cdot \mathbb{N}(R_i = r)\), where \(\mathbb{N}(R_i = r)\) denotes the number of input molecules with exactly \(r\) reads

    - When \(N\) is large, \(\mathbb{N}(R_i = r) \approx N \cdot P(R_i = r)\), and so \(R \approx \sum_r r \cdot N \cdot P(R_i = r) = N \cdot E(R_i)\)

    - Hence \(E\left(\frac{1}{R}\right) \approx \frac{1}{N \cdot E(R_i)}\)

    - Thus \(E(F) = E\left(\frac{D}{R}\right) = E(D) \cdot E\left(\frac{1}{R}\right) \approx \frac{N \cdot E(D_i)}{N \cdot E(R_i)} = \frac{E(D_i)}{E(R_i)}\)

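As a quick sanity check on this math (not part of the original analysis), the model is simple enough to simulate directly: the sketch below draws \(X\) clusters from \(N\) molecules with replacement, adds optical duplicates at rate \(O\), and computes the realized duplicate fraction, which can be compared against \(E(F)\) from the formulae above. Parameter values here are arbitrary.

```{r}
# Monte Carlo check of the duplication model (illustrative only).
simulate_dup_fraction <- function(n_molecules, total_clusters, p_opt_dup) {
  clusters <- sample.int(n_molecules, total_clusters, replace = TRUE)
  x_i <- tabulate(clusters, nbins = n_molecules) # clusters per input molecule
  p_i <- rbinom(n_molecules, x_i, p_opt_dup)     # optical duplicates per molecule
  r_i <- x_i + p_i                               # reads per molecule
  sum(pmax(0, r_i - 1)) / sum(r_i)               # F = D / R
}
simulate_dup_fraction(n_molecules = 1e6, total_clusters = 1e6, p_opt_dup = 1e-6)
```
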
Using computational implementations of these formulae, we can investigate how the fraction of duplicates varies with the number of clusters for different parameter values:

```{r}
# Define auxiliary functions
log_p_clusters <- function(n_clusters, total_clusters, n_molecules) dbinom(n_clusters, total_clusters, 1/n_molecules, log=TRUE)
log_p_opt_dups <- function(n_opt_dups, n_clusters, p_opt_dup) dbinom(n_opt_dups, n_clusters, p_opt_dup, log=TRUE)
# P(R_i = n_reads): convolve cluster-count and optical-duplicate distributions.
# Since P_i <= X_i, the cluster count is at least ceiling(n_reads/2).
p_reads <- function(n_reads, total_clusters, n_molecules, p_opt_dup){
  n_clusters <- n_reads - (0:(n_reads/2))
  log_p_n_clusters <- log_p_clusters(n_clusters, total_clusters, n_molecules)
  log_p_n_opt_dups <- log_p_opt_dups(n_reads - n_clusters, n_clusters, p_opt_dup)
  log_p_reads_clusters <- log_p_n_clusters + log_p_n_opt_dups
  return(sum(exp(log_p_reads_clusters)))
}

# Define main function
exp_duplicates <- function(n_molecules, total_clusters, p_opt_dup, initial_vector_length = 1e8){
  pr <- purrr::partial(p_reads, total_clusters = total_clusters, n_molecules = n_molecules, p_opt_dup = p_opt_dup)
  # Calculate read count probabilities, scanning upward and stopping once
  # probabilities return to zero after the bulk of the distribution has passed
  n_reads <- 1:initial_vector_length
  p_n_reads <- numeric(initial_vector_length)
  break_zero <- FALSE
  for (n in 1:length(n_reads)){
    p <- pr(n_reads[n])
    if (p == 0 && break_zero){
      n_reads <- n_reads[1:n]
      p_n_reads <- p_n_reads[1:n]
      break
    } else if (p != 0 && !break_zero){
      break_zero <- TRUE
    }
    p_n_reads[n] <- p
  }
  # Calculate fraction of duplicates
  n_duplicates <- n_reads - 1
  p_duplicates <- sum(n_duplicates * p_n_reads)/sum(n_reads * p_n_reads)
  return(p_duplicates)
}

# Parameter set 1: N = 1e6, O = 1e-6
tab_model_1 <- tibble(n_molecules = 1e6,
                      p_opt_dup = 1e-6,
                      total_clusters = round(10^seq(4,8,0.5))) %>%
  group_by(n_molecules, p_opt_dup) %>%
  mutate(p_duplicates = sapply(total_clusters, function(x) exp_duplicates(n_molecules, x, p_opt_dup)))
g_1 <- ggplot(tab_model_1, aes(x=total_clusters, y=p_duplicates)) +
  geom_vline(aes(xintercept = n_molecules), linetype = "dashed", colour = "red") + geom_line() + geom_point(shape = 16) +
  scale_x_log10(name = "# Total Clusters") +
  scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
  labs(title = paste0("N = ", tab_model_1$n_molecules[1], ", O = ", tab_model_1$p_opt_dup[1])) +
  theme_base + theme(aspect.ratio = 1)

# Parameter set 2: N = 1e9, O = 1e-6
tab_model_2 <- tibble(n_molecules = 1e9,
                      p_opt_dup = 1e-6,
                      total_clusters = round(10^seq(6,12,0.5))) %>%
  group_by(n_molecules, p_opt_dup) %>%
  mutate(p_duplicates = sapply(total_clusters, function(x) exp_duplicates(n_molecules, x, p_opt_dup)))
g_2 <- ggplot(tab_model_2, aes(x=total_clusters, y=p_duplicates)) +
  geom_vline(aes(xintercept = n_molecules), linetype = "dashed", colour = "red") + geom_line() + geom_point(shape = 16) +
  scale_x_log10(name = "# Total Clusters") +
  scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
  labs(title = paste0("N = ", tab_model_2$n_molecules[1], ", O = ", tab_model_2$p_opt_dup[1])) +
  theme_base + theme(aspect.ratio = 1)

# Parameter set 3: N = 1e12, O = 1e-6
tab_model_3 <- tibble(n_molecules = 1e12,
                      p_opt_dup = 1e-6,
                      total_clusters = round(10^seq(9,15,0.5))) %>%
  group_by(n_molecules, p_opt_dup) %>%
  mutate(p_duplicates = sapply(total_clusters, function(x) exp_duplicates(n_molecules, x, p_opt_dup)))
g_3 <- ggplot(tab_model_3, aes(x=total_clusters, y=p_duplicates)) +
  geom_vline(aes(xintercept = n_molecules), linetype = "dashed", colour = "red") + geom_line() + geom_point(shape = 16) +
  scale_x_log10(name = "# Total Clusters") +
  scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
  labs(title = paste0("N = ", tab_model_3$n_molecules[1], ", O = ", tab_model_3$p_opt_dup[1])) +
  theme_base + theme(aspect.ratio = 1)
g_1 + g_2 + g_3
```

I don't want to take these results too seriously, since they're based on an extremely simple model, but there are some qualitative takeaways that I found helpful to keep in mind when looking at the real data. In particular, the general pattern of the model results is that **the fraction of duplicate reads follows a sigmoidal pattern on a linear-log plot**:

- When \(X \ll N\), the fraction of duplicates \(F\) is close to zero.

- As \(X\) approaches \(N\), the fraction of duplicates begins increasing, first slowly and then (after \(F\) reaches about 0.1) rapidly, before leveling off after \(F\) exceeds about 0.9.

- When \(X \gg N\), \(F \approx 1\).

At least under the assumptions used here, once the fraction of duplicates goes above 15% or so, further OOM increases in the number of clusters (e.g. by buying more or larger flow cells) will lead to a dramatic increase in the fraction of duplicate reads.

# Applying model takeaways

Returning to the results from the BMC data with these modeling results in mind:

```{r}
g_dup_flat_lin
```

Assuming that the real data will follow a sigmoidal pattern roughly resembling that from the model, we see that all samples (with the possible exception of D23-13406 when `unpair repair` is disabled) are in the "danger zone", such that further OOM increases in read depth will likely lead to a dramatic increase in the fraction of duplicate reads. As such, **it probably isn't worth paying for a further OOM increase (or even half-OOM increase) in read depth for these samples**.

# Conclusions

- Overall, running this analysis was a frustrating experience, due to difficulties finding a configuration of Clumpify (or any other deduplication tool I know of) that (i) I trust to remove duplicates appropriately without predictably over- or under-counting, and (ii) runs well on large FASTQ files. Ultimately, I think we should treat the true level of duplicates as somewhere in between that measured by method 1 (Clumpify with `unpair repair` enabled) and method 2 (Clumpify with `unpair repair` disabled).

    - For the actual virus-detection pipeline, my current best bet is that we should run method 2 on the full dataset, then method 1 on the specific viral hits identified by Kraken2 (or the alignment tool used for validation, once that's been implemented). However, we may ultimately want to implement our own tool for dealing with this problem.

- Nevertheless, I was able to generate rarefaction curves for duplication rate as a function of read count using both methods. The results, in combination with a very simple model I generated to aid interpretation, suggest that we're probably already at about the highest OOM of read depth from which these samples will yield useful information (with the possible exception of D23-13406).

# Footnotes

[^1]: It might be worth explicitly modeling the difference in behavior between different kinds of duplicates as sequencing depth increases, to see whether these intuitions are borne out.

[^2]: Probably the two biggest improvements that could be made to this model in future are (i) introducing biological duplicates, and (ii) introducing sequence-specific bias in PCR amplification and cluster formation.

[^3]: I'd appreciate it if someone else on the team could check my math here.
diff --git a/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-1.png b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..fa3f8e4
Binary files /dev/null and b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-1.png differ
diff --git a/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-2.png b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-2.png
new file mode 100644
index 0000000..de354b7
Binary files /dev/null and b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-2-2.png differ
diff --git a/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-3-1.png b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-3-1.png
new file mode 100644
index 0000000..81f6c21
Binary files /dev/null and b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-3-1.png differ
diff --git a/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-4-1.png b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..fa3f8e4
Binary files /dev/null and b/docs/notebooks/2023-11-02_project-runway-dna-deduplication_files/figure-html/unnamed-chunk-4-1.png differ
diff --git a/docs/search.json b/docs/search.json
index dd3ba52..aa5732c 100644
--- a/docs/search.json
+++ b/docs/search.json
@@ -32,7 +32,7 @@
"href": "index.html",
"title": "Will's Public NAO Notebook",
"section": "",
- "text": "Comparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n \n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n \n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n \n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items"
+ "text": "Comparing viral read assignments between pipelines on Project Runway data\n\n\n\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n \n\n\n\n\nEstimating the effect of read depth on duplication rate for Project Runway DNA data\n\n\nHow deep can we go?\n\n\n\n\n\n\nNov 2, 2023\n\n\n\n\n\n\n \n\n\n\n\nInitial analysis of Project Runway protocol testing data\n\n\n\n\n\n\n\n\n\nOct 31, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing options for read deduplication\n\n\nClumpify vs fastp\n\n\n\n\n\n\nOct 19, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing Ribodetector and bbduk for rRNA detection\n\n\nIn search of quick rRNA filtering.\n\n\n\n\n\n\nOct 16, 2023\n\n\n\n\n\n\n \n\n\n\n\nComparing FASTP and AdapterRemoval for MGS pre-processing\n\n\nTwo tools – how do they perform?\n\n\n\n\n\n\nOct 12, 2023\n\n\n\n\n\n\n \n\n\n\n\nHow does Element AVITI sequencing work?\n\n\nFindings of a shallow investigation\n\n\n\n\n\n\nOct 11, 2023\n\n\n\n\n\n\n \n\n\n\n\nExtraction experiment 2: high-level results & interpretation\n\n\nComparing RNA yields and quality across extraction kits for settled solids\n\n\n\n\n\n\nSep 21, 2023\n\n\n\n\n\n\nNo matching items"
},
{
"objectID": "notebooks/2023-10-12_fastp-vs-adapterremoval.html",
@@ -138,5 +138,19 @@
"title": "Comparing viral read assignments between pipelines on Project Runway data",
"section": "",
"text": "In my last notebook entry, I reviewed some basic initial analyses of Project Runway DNA sequencing data. One notable result was that the number of reads assigned to human-infecting viruses differed significantly between the pipeline I ran for that entry and the current public pipeline. In this entry, I dig into these differences in more depth, to see whether they tell us anything about which tools to incorporate into the next version of the public pipeline.\nAt a high level, there are three main differences between the two pipelines:\n\nThe public pipeline uses AdapterRemoval for removal of adapters and quality trimming, while my pipeline uses FASTP.\nMy pipeline uses bbduk to identify and remove ribosomal reads prior to Kraken analysis, while the public pipeline does not.\nMy pipeline applies deduplication prior to Kraken analysis using clumpify, while the public pipeline applies it after Kraken analysis via a manual method.\n\nIn principle, any of these differences could be responsible for the differences in read assignment we observe. However, since very few reads were identified as ribosomal or as duplicates by the new pipeline, it’s unlikely that these differences are those responsible.\nTo investigate this, I decided to manually identify and trace the reads assigned to human-infecting viruses in both pipelines, to see whether that tells us anything about the likely cause of the differences. To do this, I selected the three samples from the dataset that show the largest difference in the number of assigned human-infecting virus reads (henceforth HV reads):\n\nD23-13405-1 (14 HV reads assigned by the public pipeline vs 8 by the new pipeline)\nD23-13405-2 (12 vs 8)\nD23-13406-2 (17 vs 9)\n\nThe way the public pipeline does deduplication (after Kraken2 analysis, during dashboard generation) makes it difficult to directly extract the final list of HV read IDs for that pipeline, but it is quite easy to do this for the list of reads immediately prior to deduplication. Doing this for the samples specified above returned the following results:\n\nD23-13405-1: 14 read IDs (no reads lost during deduplication)\nD23-13405-2: 14 read IDs (2 reads to Simian Agent 10 lost during deduplication)\nD23-13406-2: 17 read IDs (no reads lost during deduplication)\n\nIn the first and third of these cases, I could thus directly compare the Kraken output from the two pipelines to investigate the source of the disagreements. In the second case, it wasn’t immediately possible to identify which two out of the three Simian Agent 10 reads were considered duplicates in the dashboard, but I was at least able to restrict the possibility to those three reads. 
(As we’ll see below, information from the new pipeline also helped narrow this down.)\n\nCodedata_dir <- \"../data/2023-11-01_pr-comp\"\nread_status_path <- file.path(data_dir, \"read-status.csv\")\nread_status <- read_csv(read_status_path, show_col_types = FALSE) %>%\n mutate(status = fct_inorder(status))\ntheme_kit <- theme_base + theme(\n aspect.ratio = 1/2,\n axis.text.x = element_text(hjust = 1, angle = 45),\n axis.title.x = element_blank()\n)\ng_status <- ggplot(read_status, aes(x=sample, y=n_reads, fill=status)) +\n geom_col(position = \"dodge\") +\n scale_fill_brewer(palette = \"Set1\", name = \"Status\") +\n scale_y_continuous(name = \"# Putative HV read pairs\", limits = c(0,10),\n breaks = seq(0,10,2), expand = c(0,0)) +\n theme_base + theme_kit\ng_status\n\n\n\n\nD23-13405-1\n\nIn this case, 8 HV reads were assigned by the new pipeline, and 14 reads were assigned by the the public pipeline.\nAll 8 of the reads assigned by the new pipeline were among the 14 HV reads assigned by the public pipeline.\n\nAmong the 6 remaining reads that were assigned by only the public pipeline:\n\n3 appeared in the list of HV hits for the new pipeline; that is to say, for these four reads, the new pipeline was able to identify hits to human-infecting viruses but not make an overall assignment.\n2 were included in the Kraken2 output for the new pipeline, but were not found to contain any HV k-mers and so were excluded from the list of hits. This is a more extreme case of the above situation: in this case, more stringent trimming by FASTQ has removed the putative HV k-mers.\n1 was found among the reads that FASTP discarded due to not passing quality filters; in this case, read 2 was discarded due to low quality, and read 1 was then discarded due to lacking a read pair.\n\n\n\nIn all 6 cases, therefore, the difference in assignment between the two pipelines was found to be due to difference 1, i.e. the use of different preprocessing tools. In general, FASTP appears to be more stringent than AdapterRemoval in a way that resulted in fewer HV read assignments. But are these reads false positives for the old pipeline, or false negatives for the new one?\n\nTo address this, I extracted the raw sequences of the six read pairs, manually removed adapters, and manually analyzed them with NCBI BLAST (blastn vs viral nt, then vs full nt).\nIn all six cases, no match was found by blastn between the read sequence and the human-infecting virus putatively assigned by Kraken2 in the public pipeline. 
In four out of six cases, the best match for the read was to a bacterial sequence; in one case, the best match for the forward read was bacterial while the reverse matched a phage; and in one case no significant match was found for either read.\nThese results suggest to me that FASTP’s more stringent trimming and filtering is ensuring true negatives, rather than causing false ones.\n\n\nD23-13405-2\n\nIn this case, 8 HV reads were assigned by the new pipeline, and 14 by the public pipeline excluding deduplication; 2 of the latter were removed during deduplication for the dashboard.\n\nAll 8 of the reads assigned by the new pipeline were among the 14 pre-deduplication HV reads assigned by the public pipeline; however, two of these were among the group of three reads (all to Simian Agent 10) that were collapsed into one by deduplication in the public pipeline.\n\nThis indicates that one of the reads present in the new pipeline results was removed by deduplication in the public pipeline results – that is to say, the two pipelines disagree slightly more than the raw HV read counts would suggest.\nOne read was removed as a duplicate by both pipelines; I discarded this one from consideration, bringing the number of read IDs for consideration down to 13.\n\n\n\nAmong the remaining 5 HV reads from the public pipeline:\n\n4 appeared in the list of HV hits for the new pipeline; that is to say, for these four reads, the new pipeline was able to identify hits to human-infecting viruses but not make an overall assignment.\n1 was discarded by FASTP during quality filtering: read 2 was discarded due to low quality, and read 1 was then discarded due to lacking a read pair.\n\n\n\nAs before, I extracted the raw reads corresponding to these 5 disagreements from the raw sequencing data, removed adapters manually, and ran the resulting sequences through NCBI BLAST (blastn vs viral nt, then vs full nt). 4 out of 5 read pairs showed no match to the assigned virus, while one showed a good, though partial, match.\n\nIn the case of the match, it looks like in both the public pipeline and the new pipeline, Kraken2 only identified a single k-mer from the origin virus (Sandfly fever Turkey virus); however, the public pipeline also identified 3 k-mers assigned to taxid 1 (root), while these were trimmed away in the new pipeline, and this was sufficient to prevent the overall assignment.\nI’m not sure how to feel about this case. Ex ante, making an assignment on the basis of a single unique k-mer and three uninformative k-mers feels quite dubious, but ex post it does appear that at least part of the read was correctly assigned.\n\n\n\nInvestigating the pair of reads that were flagged as duplicates by the public pipeline but not the new pipeline, I found that they were a perfect match, but with the sequence of the forward read in one pair matching the reverse read in the second pair.\n\nThis was surprising to me, as it suggests that clumpify won’t remove duplicates if one is in reverse-complement to the other, which seems like a major oversight. I checked this, and indeed Clumpify fails to detect the duplicate in the current state but succeeds if I swap the forward and reverse reads for one of the read pairs.\nThis makes me less excited about using Clumpify for deduplication. UPDATE: I found an option for Clumpify that seems to solve this problem, at least in this case. 
See Conclusions for more.\nIt’s worth noting that, due to the way FASTQC analyses files, it will also fail to detect RC duplicates.\n\n\nD23-13406-2\n\nIn this case, 9 HV reads were assigned by the new pipeline, and 17 by the public pipeline.\n\nAs before, I confirmed that all 9 HV reads assigned by the new pipeline were also assigned by the public pipeline, leaving 8 disagreements. Of these:\n\n6 appeared in the list of reads for which the new pipeline found HV hits, but wasn’t able to make an overall assignment; in all six of these cases, the HV hits were to the taxon assigned by the public pipeline.\n1 was included in the new pipeline’s Kraken output but had no viral hits.\nThe final clash is the most interesting, as this one was excluded not by FASTP or Kraken, but by bbduk: it was identified as ribosomal and discarded prior to deduplication.\n\n\n\nAs before, I extracted the raw reads corresponding to these 8 disagreements from the raw sequencing data, removed adapters manually, and ran the resulting sequences through NCBI BLAST (blastn vs viral nt, then vs nt).\n\nFor 7 of the 8 disagreements – all those arising from FASTP preprocessing – BLAST found no match between the read sequence and the taxon assigned by the public pipeline. As before, this suggests to be that FASTP is doing a good job preventing false positives through better preprocessing.\nThe BLAST result for the final disagreement is again the most interesting. The forward read indeed showed strong alignment to bacterial rRNA sequences, as found by bbduk. The reverse read, however, showed good alignment to Influenza A virus, which was the virus assigned by Kraken2 in the public pipeline. In this case, it appears we have a chimeric read, which both pipelines are in some sense processing correctly: bbduk is correctly identifying the forward read as ribosomal, and Kraken2 is correctly identifying the reverse read as viral. It’s not a priori obvious to me how we should handle these cases.\n\n\nSanity checking\n\nFinally, I wanted to check that my findings here weren’t just the result of some issue with how I’m using NCBI BLAST – that is, that BLAST as I’m using it is able to detect true positives rather than just failing to find matches all over the place.\nTo check this, I took the 24 read pairs (8 + 7 + 9) from the three samples above that both pipelines agreed arose from human-infecting viruses, and ran these through BLAST in the same way as the disagreed-upon read-pairs above.\nWhile the results weren’t as unequivocal as I’d hoped, they nevertheless showed a strong difference from those for the clashing read pairs. In total, 16/24 read pairs showed strong matching to the assigned virus, and an additional 2/24 showed a short partial match sufficient to explain the Kraken assignment. The remaining 6/24 failed to match the assigned virus. In total, 75% (18/24) of agreed-upon sequences showed a match to the assigned virus, compared to only 10% (2/20) for the clashing sequences.\nThis cautiously updates me further toward believing that the FASTP component of the new pipeline is mostly doing well at correctly rejecting true negatives, at least compared to the current public pipeline. 
That said, it also suggests that either (a) BLAST is generating a significant number of false negatives, or (b) even the new pipeline is generating a significant number of false positives.\n\n\nCoderead_hit_path <- file.path(data_dir, \"read-hit-count.csv\")\nread_hit <- read_csv(read_hit_path, show_col_types = FALSE) %>%\n mutate(status = fct_inorder(status),\n p_hit = n_hit/n_reads)\ng_hit <- ggplot(read_hit, aes(x=sample, y=p_hit, fill=status)) +\n geom_col(position = \"dodge\") +\n scale_fill_brewer(palette = \"Set3\", name = \"Status\") +\n scale_y_continuous(name = \"% reads matching HV assignment\", limits = c(0,1),\n breaks = seq(0,1,0.2), expand = c(0,0), labels = function(y) y*100) +\n theme_base + theme_kit\ng_hit\n\n\n\n\nConclusions\n\nMy updates from this exercise are different for different parts of the pipeline.\n\nFor difference 1 (preprocessing tool) I mostly found that, in cases where the new pipeline rejects a read due to preprocessing and the public pipeline does not, that read appears to be a true negative. I’m not super confident about this, since I don’t 100% trust BLAST to not be producing false negatives here, but overall I think the evidence points to FASTP doing a better job here than AdapterRemoval.\n\nI’d ideally like to shift to a version of the pipeline where we’re not relying on Kraken to make assignment decisions, and are instead running all Kraken hits through an alignment-based validation pipeline to determine final assignments. I’d be interested in seeing how these results look after making that change.\n\n\nFor difference 2 (ribodepletion) results here are equivocal. The single read pair I inspected that got removed during ribodepletion appears to include both a true ribosomal read and a true viral read. I think some internal discussion is needed to decide how to handle these cases.\n\nFor difference 3 (deduplication) I initially updated negatively about Clumpify, which appeared to be unable to handle duplicates where the forward and reverse reads in a pair are switched (this is also a case where FASTQC will be unable to detect these duplicates).\n\nHowever, I found an option for Clumpify which addresses this issue, at least in this case. Specifically, one can configure Clumpify to unpair reads, perform deduplication on the forward and reverse reads all together, and then restore pairing. This successfully removes this class of duplicates.\nI’m a little worried that this approach might sometimes cause complete loss of all duplicate reads (rather than all-but-one-pair) when the best-quality duplicate differs between forward and reverse reads. I tried this out by artificially modifying the quality scores for the duplicate reads from D23-13405-2, and this doesn’t seem to be the case at least in this instance: when quality across read pairs was concordant, I was able to control which read pair survived as expected, and when it was discordant, one read pair survived anyway. Still, this remains a niggling doubt."
+ },
+ {
+ "objectID": "notebooks/2023-11-02_project-runway-dna-deduplication.html",
+ "href": "notebooks/2023-11-02_project-runway-dna-deduplication.html",
+ "title": "Estimating the effect of read depth on duplication rate for Project Runway DNA data",
+ "section": "",
+ "text": "One relevant question for both Project Runway and other NAO sequencing is: what is the maximum read depth at which we can sequence a given sample while retaining an acceptable level of sequence duplication?\nAs discussed in a previous entry, duplicate reads can arise in sequencing data from a variety of processes, including true biological duplicates present in the raw sample; processing duplicates arising from amplification and other processes during sample and library prep; and sequencing duplicates arising from various processes on the actual flow cell.\nAs we sequence more deeply, we expect the fraction of biological and processing duplicates (but not, I think, sequencing duplicates) in our read data to increase. In the former case, this is because we are capturing a larger fraction of all the input molecules in our sample; in the latter, because we are sequencing copies of the same sequence over and over again. Intuitively, I expect the increase in processing duplicates to swamp that in biological duplicates at high read depth, at least for library prep protocols that involve amplification1.\nOne simple approach to investigate the overall effect of read depth on duplication levels in the sample is rarefaction: downsampling a library to different numbers of reads and seeing how the duplication rate changes as a function of read count. In this notebook entry, I apply this approach to sequencing data from the Project Runway initial DNA dataset, to see how duplication rate behaves in this case."
+ },
+ {
+ "objectID": "notebooks/2023-11-02_project-runway-dna-deduplication.html#footnotes",
+ "href": "notebooks/2023-11-02_project-runway-dna-deduplication.html#footnotes",
+ "title": "Estimating the effect of read depth on duplication rate for Project Runway DNA data",
+ "section": "Footnotes",
+ "text": "Footnotes\n\nIt might be worth explicitly modeling the difference in behavior between different kinds of duplicates as sequencing depth increases, to see if these intuitions are borne out.↩︎\nProbably the biggest two improvements that could be made to this model in future are (i) introducing biological duplicates, and (ii) introducing sequence-specific bias in PCR amplification and cluster formation.↩︎\nI’d appreciate it if someone else on the team can check my math here.↩︎"
}
]
\ No newline at end of file
diff --git a/notebooks/2023-11-02_project-runway-dna-deduplication.qmd b/notebooks/2023-11-02_project-runway-dna-deduplication.qmd
new file mode 100644
index 0000000..7baa9de
--- /dev/null
+++ b/notebooks/2023-11-02_project-runway-dna-deduplication.qmd
@@ -0,0 +1,235 @@
+---
+title: "Estimating the effect of read depth on duplication rate for Project Runway DNA data"
+subtitle: "How deep can we go?"
+author: "Will Bradshaw"
+date: 2023-11-02
+format:
+ html:
+ code-fold: true
+ code-tools: true
+ code-link: true
+ df-print: paged
+editor: visual
+title-block-banner: black
+---
+
+```{r}
+#| label: load-packages
+#| include: false
+library(tidyverse)
+library(cowplot)
+library(patchwork)
+library(fastqcr)
+source("../scripts/aux_plot-theme.R")
+```
+
+One relevant question for both Project Runway and other NAO sequencing is: what is the maximum read depth at which we can sequence a given sample while retaining an acceptable level of sequence duplication?
+
+As discussed in a previous entry, duplicate reads can arise in sequencing data from a variety of processes, including true biological duplicates present in the raw sample; processing duplicates arising from amplification and other processes during sample and library prep; and sequencing duplicates arising from various processes on the actual flow cell.
+
+As we sequence more deeply, we expect the fraction of biological and processing duplicates (but not, I think, sequencing duplicates) in our read data to increase. In the former case, this is because we are capturing a larger fraction of all the input molecules in our sample; in the latter, because we are sequencing copies of the same sequence over and over again. Intuitively, I expect the increase in processing duplicates to swamp that in biological duplicates at high read depth, at least for library prep protocols that involve amplification[^1].
+
+[^1]: It might be worth explicitly modeling the difference in behavior between different kinds of duplicates as sequencing depth increases, to see if these intuitions are borne out.
+
+One simple approach to investigate the overall effect of read depth on duplication levels in the sample is rarefaction: downsampling a library to different numbers of reads and seeing how the duplication rate changes as a function of read count. In this notebook entry, I apply this approach to sequencing data from the Project Runway initial DNA dataset, to see how duplication rate behaves in this case.
+
+# Methods
+
+To analyze the effect of read depth on duplication rates in this data, I first concatenated the raw reads from the two sequencing replicates of each sample together. This allowed the analysis to detect duplicates across replicates, that would have been missed by analyzing them separately.
+
+Next, I performed preprocessing to remove adapter sequences and low-quality bases that might interfere with duplicate detection. I didn't collapse read pairs together or discard read pairs with overall low quality.
+
+```
+for p in $(cat prefixes.txt); do echo $p; fastp -i raw/${p}_1.fastq.gz -I raw/${p}_2.fastq.gz -o preproc/${p}_fastp_1.fastq.gz -O preproc/${p}_fastp_2.fastq.gz --failed_out preproc/${p}_fastp_failed.fastq.gz --cut_tail --correction --detect_adapter_for_pe --adapter_fasta adapters.fa --trim_poly_x --thread 16 -Q -L; done
+```
+
+I then took each pair of preprocessed FASTQ files and downsampled them to specified numbers of reads, from 10,000 to 100,000,000 in units of 1 OOM, using `seqtk sample`. I performed downsampling 3 times for each read count, with the intent of calculating duplication rate separately for each replicate and taking the average.
+
+Finally, I ran deduplication with Clumpify on each downsampled pair of read files, as well as the raw preprocessed files, and recorded the fraction of reads discarded in each case. I performed this twice, using different Clumpify settings each time:
+
+1. First, I attempted to perform deduplication in a maximally comprehensive way, using Clumpify's `unpair repair` options to identify and remove duplicates in opposite orientation across read pairs (as discussed [here](https://data.securebio.org/wills-public-notebook/notebooks/2023-11-02_project-runway-comparison.html#conclusions)). This configuration finds more duplicates, but (a) might overestimate duplicates in cases where only one read in a pair matches another read, and (b) causes memory-related errors for large subsample sizes.
+
+ ```
+ clumpify.sh in= in2= out= out2= dedupe containment unpair repair
+ ```
+
+2. Second, due to the aforementioned issues with approach 1, I repeated deduplication without Clumpify's `unpair repair` options enabled, providing a lower-bound estimate of duplication levels which should be more consistent with estimates provided by, for example, FASTQC.
+
+ ```
+ clumpify.sh in= in2= out= out2= dedupe containment
+ ```
+
+# Results
+
+Plotting the fraction of duplicate reads (with or without Clumpify's `unpair repair` options) gives us the following result:
+
+```{r}
+#| warning: false
+# Import data
+data_dir <- "../data/2023-11-06_pr-dedup/"
+n_dup_path <- file.path(data_dir, "n_dup.csv")
+n_dup <- read_csv(n_dup_path, show_col_types = FALSE) %>%
+ mutate(n_dup = read_pairs_in - read_pairs_out, p_dup = n_dup/read_pairs_in,
+ o_dup = p_dup/(1-p_dup))
+
+# Reshape data
+n_dup_flat <- n_dup %>% group_by(sample, unpair_repair, read_pairs_in) %>%
+ summarize(p_dup_mean = mean(p_dup), p_dup_sd = sd(p_dup), p_dup_min = max(0,p_dup_mean-p_dup_sd), p_dup_max = min(1, p_dup_mean+p_dup_sd),
+ o_dup_mean = mean(o_dup), o_dup_sd = sd(o_dup), o_dup_min = max(0,o_dup_mean-o_dup_sd), o_dup_max = min(1, o_dup_mean+o_dup_sd), .groups = "drop")
+
+# Plot data
+g_dup_flat_base <- ggplot(n_dup_flat, aes(x=read_pairs_in, y=p_dup_mean, color = sample)) +
+ geom_line() + geom_errorbar(aes(ymin=p_dup_min, ymax=p_dup_max)) + geom_point(shape = 16) +
+ scale_x_log10(name = "# Input Read Pairs") +
+ facet_grid(.~unpair_repair, labeller = "label_both") +
+ scale_color_brewer(palette = "Dark2") +
+ theme_base + theme(aspect.ratio = 1)
+g_dup_flat_lin <- g_dup_flat_base +
+ scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,0.41), breaks = seq(0,1,0.1), expand = c(0,0))
+g_dup_flat_log <- g_dup_flat_base + scale_y_log10(name = "Fraction of duplicate reads")
+
+# Show plats
+g_dup_flat_lin
+g_dup_flat_log
+```
+
+We can see that the fraction of duplicates reaches quite high levels as we approach the full read count, especially when `unpair repair` is enabled. It also looks like the gradient of increase is quite high on the linear-log plot, which would suggest that further OOM increases in read depth might result in quite large increases in the fraction of duplicates. It's also apparent that, for whatever reason, D23-13406 has substantially fewer duplicates at any given read depth than the other two samples.
+
+However, further interpretation of these results, including extrapolation to greater read depths, is made difficult by the lack of a theoretical model for what we expect to see. It's also not clear what mode of visualization (Linear-log? Log-linear? Log-log? Fraction of duplicates expressed as probabilities or odds? Etc) is most meaningful for interpretation.
+
+To start resolving some of these roadblocks, I spent some time working on a very simple model of read duplication, to see what it might tell us about the expected pattern of duplicates as a function of read depth.
+
+# A very very simple model of read duplication
+
+- Imagine a sample containing $N$ distinct molecules, which are uniformly amplified up to $M = N \times 2^C$ molecules by a perfectly unbiased $C$-cycle PCR reaction. Adapters are ligated and the resulting library is washed across the flow cell to generate $X$ total clusters (again, without bias[^2]). These clusters are then sequenced by a process that generates optical duplicates at some rate $O$, for a total expectation of $R=X\times(1+O)$ reads.
+
+- Each cluster is selected from the $M$ molecules in the library, without replacement. When $C$ is large and amplification is uniform across molecules, this is well-approximated by selecting from the $N$ input molecules with replacement. Under these conditions, the number of clusters generated from input molecule $i$ is approximately $X_i \sim \mathrm{B}(X,N^{-1})$.
+
+- The number of optical duplicates generated from a given input molecule is then approximately $P_i \sim \mathrm{B}(X_i,O)$, and the total number of reads corresponding to a given input molecule is thus $R_i = X_i + P_i$. The number of duplicates generated from that molecule is then $D_i = \max(0,R_i - 1)$, and the overall fraction of duplicates is $F=\frac{D}{R}=\frac{\sum_iD_i}{\sum_iR_i}$ .
+
+- The expected fraction of duplicates under this model can be estimated analytically as follows[^3]:
+
+ - $E(R_i)=\sum_{r=1}^{2X} r \cdot P(R_i = r) = \sum_{r=1}^{2X}r\cdot\left[\sum_{k=0}^{r}P(X_i = k)\cdot{}P(P_i=r-k|X_i=k)\right]$
+
+ - $E(D_i) = \sum_{r=1}^{2X} (r-1) \cdot P(R_i = r)$
+
+ - $E(D) = E\left(\sum_i D_i\right) = N \cdot E(D_i)$
+
+ - $R = \sum_i R_i = \sum_r r \cdot \mathbb{N}(R_i = r)$
+
+ - When $N$ is large, $\mathbb{N}(R_i = r) \approx N \cdot P(R_i = r)$, and so $R \approx \sum_r r \cdot N \cdot P(R_i = r) = N \cdot E(R_i)$
+
+ - Hence $E\left(\frac{1}{R}\right) \approx \frac{1}{N \cdot E(R_i)}$
+
+    - Thus $E(F) = E\left(\frac{D}{R}\right) \approx E(D) \cdot E\left(\frac{1}{R}\right) \approx \frac{N \cdot E(D_i)}{N \cdot E(R_i)} = \frac{E(D_i)}{E(R_i)}$, where the first approximation treats $D$ and $\frac{1}{R}$ as uncorrelated, which should hold well when $N$ is large and $R$ concentrates around its expectation
+
+[^2]: Probably the biggest two improvements that could be made to this model in future are (i) introducing biological duplicates, and (ii) introducing sequence-specific bias in PCR amplification and cluster formation.
+
+[^3]: I'd appreciate it if someone else on the team can check my math here.
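+
+As a cross-check on the derivation above, the model is also simple enough to simulate directly. Here's a minimal Monte Carlo sketch (the `sim_duplicates` helper is my own, purely for illustration); like the derivation, it uses the with-replacement approximation for cluster selection, and its output should roughly agree with the analytic estimates computed below:
+
+```{r}
+# Sketch: direct simulation of the duplication model
+sim_duplicates <- function(n_molecules, total_clusters, p_opt_dup, n_reps = 10){
+  sapply(seq_len(n_reps), function(i){
+    # Clusters per input molecule: assign each cluster to a uniformly random
+    # molecule (with-replacement approximation), then count
+    clusters_per_molecule <- tabulate(sample.int(n_molecules, total_clusters, replace = TRUE))
+    clusters_per_molecule <- clusters_per_molecule[clusters_per_molecule > 0]
+    # Optical duplicates per molecule: P_i ~ B(X_i, O)
+    opt_dups <- rbinom(length(clusters_per_molecule), clusters_per_molecule, p_opt_dup)
+    reads_per_molecule <- clusters_per_molecule + opt_dups
+    # F = sum(R_i - 1) / sum(R_i), over molecules with at least one read
+    sum(reads_per_molecule - 1) / sum(reads_per_molecule)
+  })
+}
+mean(sim_duplicates(1e6, 1e6, 1e-6))
+```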
+
+Using computational implementations of these formulae, we can investigate how the fraction of duplicates varies with the number of clusters for different parameter values.
+
+```{r}
+# Define auxiliary functions
+# P(X_i = k): clusters generated from input molecule i, X_i ~ B(X, 1/N)
+log_p_clusters <- function(n_clusters, total_clusters, n_molecules) dbinom(n_clusters, total_clusters, 1/n_molecules, log=TRUE)
+# P(P_i = d | X_i = k): optical duplicates of those clusters, P_i ~ B(X_i, O)
+log_p_opt_dups <- function(n_opt_dups, n_clusters, p_opt_dup) dbinom(n_opt_dups, n_clusters, p_opt_dup, log=TRUE)
+# P(R_i = r): marginalise over the ways of splitting r reads into clusters
+# plus optical duplicates (at most half the reads can be optical duplicates)
+p_reads <- function(n_reads, total_clusters, n_molecules, p_opt_dup){
+  n_clusters <- n_reads - (0:(n_reads/2))
+  log_p_n_clusters <- log_p_clusters(n_clusters, total_clusters, n_molecules)
+  log_p_n_opt_dups <- log_p_opt_dups(n_reads - n_clusters, n_clusters, p_opt_dup)
+  log_p_reads_clusters <- log_p_n_clusters + log_p_n_opt_dups
+  return(sum(exp(log_p_reads_clusters)))
+}
+
+# Define main function
+exp_duplicates <- function(n_molecules, total_clusters, p_opt_dup, initial_vector_length = 1e8){
+ pr <- purrr::partial(p_reads, total_clusters = total_clusters, n_molecules = n_molecules, p_opt_dup = p_opt_dup)
+  # Calculate P(R_i = r) for increasing r, stopping once the support of the
+  # distribution has been passed (i.e. at the first zero probability after a
+  # nonzero one; leading zeros can occur through underflow)
+  n_reads <- 1:initial_vector_length
+  p_n_reads <- numeric(initial_vector_length)
+  break_zero <- FALSE # TRUE once a nonzero probability has been seen
+  for (n in seq_along(n_reads)){
+    p <- pr(n_reads[n])
+    if (p == 0 && break_zero){
+      # Truncate both vectors to the observed support and stop
+      n_reads <- n_reads[1:n]
+      p_n_reads <- p_n_reads[1:n]
+      break
+    } else if (p != 0 && !break_zero){
+      break_zero <- TRUE
+    }
+    p_n_reads[n] <- p
+  }
+ # Calculate fraction of duplicates
+ n_duplicates <- n_reads - 1
+ p_duplicates <- sum(n_duplicates * p_n_reads)/sum(n_reads * p_n_reads)
+ return(p_duplicates)
+}
+
+# Helper functions, to avoid repeating the same table and plot code for each
+# parameter set
+model_dup_tab <- function(n_molecules, p_opt_dup, log10_total_clusters){
+  tibble(n_molecules = n_molecules,
+         p_opt_dup = p_opt_dup,
+         total_clusters = round(10^log10_total_clusters)) %>%
+    mutate(p_duplicates = sapply(total_clusters,
+                                 function(x) exp_duplicates(n_molecules[1], x, p_opt_dup[1])))
+}
+plot_model_dup <- function(tab){
+  ggplot(tab, aes(x=total_clusters, y=p_duplicates)) +
+    geom_vline(aes(xintercept = n_molecules), linetype = "dashed", colour = "red") +
+    geom_line() + geom_point(shape = 16) +
+    scale_x_log10(name = "# Total Clusters") +
+    scale_y_continuous(name = "Fraction of duplicate reads", limits = c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
+    labs(title = paste0("N = ", tab$n_molecules[1], ", O = ", tab$p_opt_dup[1])) +
+    theme_base + theme(aspect.ratio = 1)
+}
+
+# Parameter set 1: N = 1e6, O = 1e-6
+g_1 <- plot_model_dup(model_dup_tab(1e6, 1e-6, seq(4,8,0.5)))
+
+# Parameter set 2: N = 1e9, O = 1e-6
+g_2 <- plot_model_dup(model_dup_tab(1e9, 1e-6, seq(6,12,0.5)))
+
+# Parameter set 3: N = 1e12, O = 1e-6
+g_3 <- plot_model_dup(model_dup_tab(1e12, 1e-6, seq(9,15,0.5)))
+g_1 + g_2 + g_3
+```
+
+I don't want to take these results too seriously, since they're based on an extremely simple model, but there are some qualitative takeaways that I found helpful to keep in mind when looking at the real data. In particular, the general pattern of the model results is that **the fraction of duplicate reads follows a sigmoidal pattern on a linear-log plot**:
+
+- When $X \ll N$, the fraction of duplicates $F$ is close to zero.
+
+- As $X$ approaches $N$, the fraction of duplicates begins increasing, first slowly and then (after $F$ reaches about 0.1) rapidly, before leveling off after $F$ exceeds about 0.9.
+
+- When $X \gg N$, $F \approx 1$.
+
+At least under the assumptions used here, once the fraction of duplicates goes above 15% or so, further OOM increases in the number of clusters (e.g. by buying more or larger flow cells) will lead to a dramatic increase in the fraction of duplicate reads.
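+
+One way to make this threshold behaviour concrete: if we neglect optical duplicates ($O \approx 0$), each $R_i$ is approximately Poisson-distributed with rate $\lambda = X/N$, so $E(D_i) = \lambda - (1 - e^{-\lambda})$ and $E(R_i) = \lambda$, giving the closed form $E(F) \approx 1 - \frac{1 - e^{-\lambda}}{\lambda}$. A quick sketch (the `f_poisson` helper is illustrative, not part of the pipeline) reproduces the thresholds above:
+
+```{r}
+# Sketch: closed-form duplicate fraction under the Poisson approximation
+f_poisson <- function(lambda) 1 - (1 - exp(-lambda)) / lambda
+round(f_poisson(c(0.2, 0.33, 1, 10)), 2) # expect ~0.09, 0.15, 0.37, 0.90
+```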
+
+# Applying model takeaways
+
+Returning to the results from the BMC data with these modeling results in mind:
+
+```{r}
+g_dup_flat_lin
+```
+
+Assuming that the real data will follow a sigmoidal pattern roughly resembling that from the model, we see that all samples (with the possible exception of D23-13406 when `unpair repair` is disabled) are in the "danger zone", such that further OOM increases in read depth will likely lead to a dramatic increase in the fraction of duplicate reads. As such, **it probably isn't worth paying for a further OOM increase (or even half-OOM increase) in read depth for these samples**.
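+
+Taking the Poisson heuristic from the previous section at face value (a rough extrapolation, not a calibrated estimate), we can also invert an observed duplicate fraction to an effective $\lambda = X/N$ and ask directly what another OOM of depth would do:
+
+```{r}
+# Sketch: extrapolate duplication one OOM deeper via the illustrative
+# f_poisson() helper defined earlier
+lambda_from_f <- function(f) uniroot(function(l) f_poisson(l) - f, c(1e-6, 1e6))$root
+f_after_10x <- function(f_now) f_poisson(10 * lambda_from_f(f_now))
+f_after_10x(0.2) # a sample at ~20% duplicates now -> ~0.79 after a 10x increase
+```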
+
+# Conclusions
+
+- Overall, running this analysis was a frustrating experience, due to difficulties finding a configuration of Clumpify (or any other deduplication tool I know of) that (i) I trust to remove duplicates appropriately without predictably over- or under-counting, and (ii) runs well on large FASTQ files. Ultimately, I think we should treat the true level of duplicates as somewhere in between that measured by method 1 (Clumpify with `unpair repair` enabled) and method 2 (Clumpify with `unpair repair` disabled).
+
+ - For the actual virus-detection pipeline, my current best bet is that we should run method 2 on the full dataset, then method 1 on the specific viral hits identified by Kraken2 (or the alignment tool used for validation, once that's been implemented). However, we ultimately might want to implement our own tool for dealing with this problem.
+
+- Nevertheless, I was able to generate rarefaction curves for duplication rate as a function of the number of reads using both methods. The results, in combination with a very simple model I generated to aid interpretation, suggest that we're probably at about the highest OOM read depth we should aim for in terms of getting useful information from these samples (with the possible exception of D23-13406).