diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB13831/adapters.fasta new file mode 100644 index 0000000..f7569f7 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB13831/adapters.fasta @@ -0,0 +1,65 @@ +>0 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>1 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>2 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>3 +unspecified +>4 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>5 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>6 +TTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGG +>7 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>8 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>9 +GTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCT +>10 +CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT +>11 +TAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAA +>12 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>13 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>14 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>15 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>16 +AGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAAC +>17 +CCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAA +>18 +GAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACT +>19 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>20 +heifigepsna +>21 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>22 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>23 +CTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATA +>24 +CAAGCAGAAGACGGCATACGAGAT +>25 +AGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG +>26 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>27 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>28 +TTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTA +>29 +AGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTC +>30 +TATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAG +>31 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/hv_clade_counts.tsv.gz new file 
mode 100644 index 0000000..f525cd8 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..875a70c Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..79cb7cd Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..c900581 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..649899c Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..2553cc8 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..c7e2418 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/sample-metadata.csv 
b/data/2024-05-06_munk/Munk-PRJEB13831/sample-metadata.csv new file mode 100644 index 0000000..df7ff02 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB13831/sample-metadata.csv @@ -0,0 +1,247 @@ +library,sample,country,city,date,dataset,bioproject +ERR1713331,ERR1713331,Albania,Tirana,2016-02-01,Munk 2022,PRJEB13831 +ERR1713332,ERR1713332,Australia,Canberra,2016-02-04,Munk 2022,PRJEB13831 +ERR1713333,ERR1713333,Australia,Melbourne,2016-02-08,Munk 2022,PRJEB13831 +ERR1713335,ERR1713335,Bulgaria,Sofia,2016-02-04,Munk 2022,PRJEB13831 +ERR1713336,ERR1713336,Brazil,Belo Horizonte,2016-02-16,Munk 2022,PRJEB13831 +ERR1713337,ERR1713337,Brazil,Belem,2016-02-15,Munk 2022,PRJEB13831 +ERR1713338,ERR1713338,Botswana,Gaborone,2016-01-25,Munk 2022,PRJEB13831 +ERR1713339,ERR1713339,Canada,Regina,2016-01-26,Munk 2022,PRJEB13831 +ERR1713340,ERR1713340,Canada,Calgary,2016-02-01,Munk 2022,PRJEB13831 +ERR1713341,ERR1713341,Canada,Toronto,2016-02-01,Munk 2022,PRJEB13831 +ERR1713342,ERR1713342,Canada,Ottawa,2016-01-26,Munk 2022,PRJEB13831 +ERR1713343,ERR1713343,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR1713344,ERR1713344,China,Guangzhou,2016-01-28,Munk 2022,PRJEB13831 +ERR1713345,ERR1713345,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB13831 +ERR1713346,ERR1713346,Colombia,Bogota,2016-01-27,Munk 2022,PRJEB13831 +ERR1713347,ERR1713347,Czech Republic,Prague,2016-02-01,Munk 2022,PRJEB13831 +ERR1713348,ERR1713348,Germany,Berlin,2016-02-08,Munk 2022,PRJEB13831 +ERR1713349,ERR1713349,Denmark,Copenhagen,2016-03-02,Munk 2022,PRJEB13831 +ERR1713350,ERR1713350,Denmark,Copenhagen,2016-03-02,Munk 2022,PRJEB13831 +ERR1713351,ERR1713351,Denmark,Copenhagen,2016-03-02,Munk 2022,PRJEB13831 +ERR1713352,ERR1713352,Ecuador,Quito,2016-02-01,Munk 2022,PRJEB13831 +ERR1713353,ERR1713353,Ecuador,"San Cristobal Island, Gal?pagos",2016-02-06,Munk 2022,PRJEB13831 +ERR1713354,ERR1713354,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR1713355,ERR1713355,Ethiopia,Addis Ababa,2016-02-01,Munk 
2022,PRJEB13831 +ERR1713356,ERR1713356,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB13831 +ERR1713357,ERR1713357,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB13831 +ERR1713358,ERR1713358,Ghana,Tamale,2016-01-30,Munk 2022,PRJEB13831 +ERR1713359,ERR1713359,Gambia,Banjul,2016-01-28,Munk 2022,PRJEB13831 +ERR1713360,ERR1713360,Croatia,Zagreb,2016-02-01,Munk 2022,PRJEB13831 +ERR1713361,ERR1713361,Hungary,Budapest,2016-02-02,Munk 2022,PRJEB13831 +ERR1713362,ERR1713362,India,Kochi,2016-01-27,Munk 2022,PRJEB13831 +ERR1713363,ERR1713363,Ireland,Galway,2016-02-10,Munk 2022,PRJEB13831 +ERR1713364,ERR1713364,Iran,Tehran,2016-02-02,Munk 2022,PRJEB13831 +ERR1713365,ERR1713365,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB13831 +ERR1713366,ERR1713366,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB13831 +ERR1713367,ERR1713367,Italy,Rome,2016-02-02,Munk 2022,PRJEB13831 +ERR1713368,ERR1713368,Kazakhstan,Almaty,2016-02-02,Munk 2022,PRJEB13831 +ERR1713369,ERR1713369,Kenya,Nairobi,2016-02-08,Munk 2022,PRJEB13831 +ERR1713370,ERR1713370,Cambodia,Phnom Penh,2016-02-05,Munk 2022,PRJEB13831 +ERR1713371,ERR1713371,Sri Lanka,Colombo,2016-02-28,Munk 2022,PRJEB13831 +ERR1713372,ERR1713372,Luxembourg,Luxembourg-city,2016-02-01,Munk 2022,PRJEB13831 +ERR1713373,ERR1713373,Latvia,Riga,2016-02-01,Munk 2022,PRJEB13831 +ERR1713374,ERR1713374,Moldova,,,Munk 2022,PRJEB13831 +ERR1713375,ERR1713375,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB13831 +ERR1713376,ERR1713376,Malta,Xghajra,2016-02-01,Munk 2022,PRJEB13831 +ERR1713377,ERR1713377,Malaysia,Kuala Lumpur,2016-02-04,Munk 2022,PRJEB13831 +ERR1713378,ERR1713378,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB13831 +ERR1713379,ERR1713379,Netherlands,Amsterdam,2016-01-27,Munk 2022,PRJEB13831 +ERR1713380,ERR1713380,Norway,Oslo,2016-02-03,Munk 2022,PRJEB13831 +ERR1713381,ERR1713381,Nepal,Kathmandu,2016-01-30,Munk 2022,PRJEB13831 +ERR1713382,ERR1713382,New Zealand,Dunedin,2016-01-27,Munk 2022,PRJEB13831 +ERR1713383,ERR1713383,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB13831 
+ERR1713384,ERR1713384,Pakistan,Karachi,2016-01-27,Munk 2022,PRJEB13831 +ERR1713385,ERR1713385,Peru,Lima,2016-02-04,Munk 2022,PRJEB13831 +ERR1713386,ERR1713386,Senegal,Dakar,2016-02-02,Munk 2022,PRJEB13831 +ERR1713387,ERR1713387,Singapore,"Singapore, Clementi",2016-02-03,Munk 2022,PRJEB13831 +ERR1713388,ERR1713388,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB13831 +ERR1713389,ERR1713389,Slovakia,in Bratislava,2016-02-02,Munk 2022,PRJEB13831 +ERR1713390,ERR1713390,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB13831 +ERR1713391,ERR1713391,Sweden,Uppsala,2016-02-02,Munk 2022,PRJEB13831 +ERR1713392,ERR1713392,Sweden,Gothenburg,2016-02-11,Munk 2022,PRJEB13831 +ERR1713393,ERR1713393,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB13831 +ERR1713394,ERR1713394,Togo,Lome,2016-01-29,Munk 2022,PRJEB13831 +ERR1713395,ERR1713395,Turkey,Ankara,2016-02-03,Munk 2022,PRJEB13831 +ERR1713396,ERR1713396,Tanzania,Moshi,2016-02-03,Munk 2022,PRJEB13831 +ERR1713397,ERR1713397,USA,"Atlanta, GA",2016-02-02,Munk 2022,PRJEB13831 +ERR1713398,ERR1713398,USA,"Seattle, WA",2016-02-01,Munk 2022,PRJEB13831 +ERR1713399,ERR1713399,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR1713400,ERR1713400,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1713401,ERR1713401,USA,"Portland, OR",2016-02-10,Munk 2022,PRJEB13831 +ERR1713402,ERR1713402,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1713403,ERR1713403,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1713404,ERR1713404,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1713405,ERR1713405,USA,,,Munk 2022,PRJEB13831 +ERR1713406,ERR1713406,USA,"Boulder, CO",2016-03-28,Munk 2022,PRJEB13831 +ERR1713407,ERR1713407,Viet Nam,Ho Chi Minh,2016-01-27,Munk 2022,PRJEB13831 +ERR1713408,ERR1713408,Kosovo,Prishtina,2016-02-03,Munk 2022,PRJEB13831 +ERR1713409,ERR1713409,South Africa,Pretoria,2016-02-04,Munk 2022,PRJEB13831 +ERR1713410,ERR1713410,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB13831 +ERR1713411,ERR1713411,Zambia,Kitwe,2016-01-12,Munk 
2022,PRJEB13831 +ERR1725938,ERR1725938,Albania,Tirana,2016-02-01,Munk 2022,PRJEB13831 +ERR1725939,ERR1725939,Australia,Canberra,2016-02-04,Munk 2022,PRJEB13831 +ERR1725940,ERR1725940,Australia,Canberra,2016-02-04,Munk 2022,PRJEB13831 +ERR1725941,ERR1725941,Australia,Melbourne,2016-02-08,Munk 2022,PRJEB13831 +ERR1725942,ERR1725942,Australia,Melbourne,2016-02-08,Munk 2022,PRJEB13831 +ERR1725943,ERR1725943,Australia,Melbourne,2016-02-08,Munk 2022,PRJEB13831 +ERR1725944,ERR1725944,Australia,Melbourne,2016-02-08,Munk 2022,PRJEB13831 +ERR1725947,ERR1725947,Bulgaria,Sofia,2016-02-04,Munk 2022,PRJEB13831 +ERR1725948,ERR1725948,Bulgaria,Sofia,2016-02-04,Munk 2022,PRJEB13831 +ERR1725949,ERR1725949,Brazil,Belo Horizonte,2016-02-16,Munk 2022,PRJEB13831 +ERR1725950,ERR1725950,Brazil,Belem,2016-02-15,Munk 2022,PRJEB13831 +ERR1725951,ERR1725951,Brazil,Belem,2016-02-15,Munk 2022,PRJEB13831 +ERR1725952,ERR1725952,Botswana,Gaborone,2016-01-25,Munk 2022,PRJEB13831 +ERR1725953,ERR1725953,Botswana,Gaborone,2016-01-25,Munk 2022,PRJEB13831 +ERR1725954,ERR1725954,Canada,Regina,2016-01-26,Munk 2022,PRJEB13831 +ERR1725955,ERR1725955,Canada,Calgary,2016-02-01,Munk 2022,PRJEB13831 +ERR1725956,ERR1725956,Canada,Calgary,2016-02-01,Munk 2022,PRJEB13831 +ERR1725957,ERR1725957,Canada,Calgary,2016-02-01,Munk 2022,PRJEB13831 +ERR1725958,ERR1725958,Canada,Calgary,2016-02-01,Munk 2022,PRJEB13831 +ERR1725959,ERR1725959,Canada,Toronto,2016-02-01,Munk 2022,PRJEB13831 +ERR1725960,ERR1725960,Canada,Toronto,2016-02-01,Munk 2022,PRJEB13831 +ERR1725961,ERR1725961,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR1725962,ERR1725962,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR1725963,ERR1725963,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR1725964,ERR1725964,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR1725965,ERR1725965,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB13831 +ERR1725966,ERR1725966,Czech Republic,Prague,2016-02-01,Munk 2022,PRJEB13831 
+ERR1725967,ERR1725967,Germany,Berlin,2016-02-08,Munk 2022,PRJEB13831 +ERR1725968,ERR1725968,Germany,Berlin,2016-02-08,Munk 2022,PRJEB13831 +ERR1725969,ERR1725969,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR1725970,ERR1725970,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR1725971,ERR1725971,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR1725972,ERR1725972,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR1725973,ERR1725973,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB13831 +ERR1725974,ERR1725974,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB13831 +ERR1725975,ERR1725975,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB13831 +ERR1725976,ERR1725976,Croatia,Zagreb,2016-02-01,Munk 2022,PRJEB13831 +ERR1725977,ERR1725977,Croatia,Zagreb,2016-02-01,Munk 2022,PRJEB13831 +ERR1725978,ERR1725978,India,Kochi,2016-01-27,Munk 2022,PRJEB13831 +ERR1725979,ERR1725979,Ireland,Galway,2016-02-10,Munk 2022,PRJEB13831 +ERR1725980,ERR1725980,Ireland,Galway,2016-02-10,Munk 2022,PRJEB13831 +ERR1725981,ERR1725981,Iran,Tehran,2016-02-02,Munk 2022,PRJEB13831 +ERR1725982,ERR1725982,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB13831 +ERR1725983,ERR1725983,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB13831 +ERR1725984,ERR1725984,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB13831 +ERR1725985,ERR1725985,Kenya,Nairobi,2016-02-08,Munk 2022,PRJEB13831 +ERR1725986,ERR1725986,Kenya,Nairobi,2016-02-08,Munk 2022,PRJEB13831 +ERR1725987,ERR1725987,Latvia,Riga,2016-02-01,Munk 2022,PRJEB13831 +ERR1725988,ERR1725988,Moldova,,,Munk 2022,PRJEB13831 +ERR1725989,ERR1725989,Moldova,,,Munk 2022,PRJEB13831 +ERR1725990,ERR1725990,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB13831 +ERR1725991,ERR1725991,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB13831 +ERR1725992,ERR1725992,Malta,Xghajra,2016-02-01,Munk 2022,PRJEB13831 +ERR1725993,ERR1725993,Malta,Xghajra,2016-02-01,Munk 2022,PRJEB13831 +ERR1725994,ERR1725994,Malaysia,Kuala Lumpur,2016-02-04,Munk 2022,PRJEB13831 +ERR1725995,ERR1725995,Malaysia,Kuala 
Lumpur,2016-02-04,Munk 2022,PRJEB13831 +ERR1725996,ERR1725996,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB13831 +ERR1725997,ERR1725997,Netherlands,Amsterdam,2016-01-27,Munk 2022,PRJEB13831 +ERR1725998,ERR1725998,Netherlands,Amsterdam,2016-01-27,Munk 2022,PRJEB13831 +ERR1725999,ERR1725999,Norway,Oslo,2016-02-03,Munk 2022,PRJEB13831 +ERR1726000,ERR1726000,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB13831 +ERR1726001,ERR1726001,Singapore,"Singapore, Clementi",2016-02-03,Munk 2022,PRJEB13831 +ERR1726002,ERR1726002,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB13831 +ERR1726003,ERR1726003,Slovakia,in Bratislava,2016-02-02,Munk 2022,PRJEB13831 +ERR1726004,ERR1726004,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB13831 +ERR1726005,ERR1726005,Sweden,Gothenburg,2016-02-11,Munk 2022,PRJEB13831 +ERR1726006,ERR1726006,Sweden,Gothenburg,2016-02-11,Munk 2022,PRJEB13831 +ERR1726007,ERR1726007,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB13831 +ERR1726008,ERR1726008,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB13831 +ERR1726009,ERR1726009,Turkey,Ankara,2016-02-03,Munk 2022,PRJEB13831 +ERR1726010,ERR1726010,USA,"Seattle, WA",2016-02-01,Munk 2022,PRJEB13831 +ERR1726011,ERR1726011,USA,"Seattle, WA",2016-02-01,Munk 2022,PRJEB13831 +ERR1726012,ERR1726012,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR1726013,ERR1726013,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR1726014,ERR1726014,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR1726015,ERR1726015,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR1726016,ERR1726016,USA,"Portland, OR",2016-02-10,Munk 2022,PRJEB13831 +ERR1726017,ERR1726017,USA,"Portland, OR",2016-02-10,Munk 2022,PRJEB13831 +ERR1726018,ERR1726018,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1726019,ERR1726019,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1726020,ERR1726020,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1726021,ERR1726021,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1726022,ERR1726022,USA,"El Paso, TX",2016-02-22,Munk 
2022,PRJEB13831 +ERR1726023,ERR1726023,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR1726024,ERR1726024,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726025,ERR1726025,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726026,ERR1726026,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726027,ERR1726027,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726028,ERR1726028,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726029,ERR1726029,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR1726030,ERR1726030,South Africa,Pretoria,2016-02-04,Munk 2022,PRJEB13831 +ERR1726031,ERR1726031,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB13831 +ERR1726032,ERR1726032,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR1726033,ERR1726033,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR1726034,ERR1726034,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR1726035,ERR1726035,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR2592244,ERR2592244,Albania,Tirana,2016-02-01,Munk 2022,PRJEB13831 +ERR2592245,ERR2592245,Australia,Canberra,2016-02-04,Munk 2022,PRJEB13831 +ERR2592247,ERR2592247,Bulgaria,Sofia,2016-02-04,Munk 2022,PRJEB13831 +ERR2592248,ERR2592248,Brazil,Belo Horizonte,2016-02-16,Munk 2022,PRJEB13831 +ERR2592249,ERR2592249,Botswana,Gaborone,2016-01-25,Munk 2022,PRJEB13831 +ERR2592250,ERR2592250,Canada,Toronto,2016-02-01,Munk 2022,PRJEB13831 +ERR2592251,ERR2592251,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR2592252,ERR2592252,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB13831 +ERR2592253,ERR2592253,Czech Republic,Prague,2016-02-01,Munk 2022,PRJEB13831 +ERR2592254,ERR2592254,Germany,Berlin,2016-02-08,Munk 2022,PRJEB13831 +ERR2592255,ERR2592255,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB13831 +ERR2592256,ERR2592256,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB13831 +ERR2592257,ERR2592257,Croatia,Zagreb,2016-02-01,Munk 2022,PRJEB13831 +ERR2592258,ERR2592258,India,Kochi,2016-01-27,Munk 2022,PRJEB13831 
+ERR2592259,ERR2592259,Iran,Tehran,2016-02-02,Munk 2022,PRJEB13831 +ERR2592260,ERR2592260,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB13831 +ERR2592261,ERR2592261,Kenya,Nairobi,2016-02-08,Munk 2022,PRJEB13831 +ERR2592262,ERR2592262,Latvia,Riga,2016-02-01,Munk 2022,PRJEB13831 +ERR2592263,ERR2592263,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB13831 +ERR2592264,ERR2592264,Malaysia,Kuala Lumpur,2016-02-04,Munk 2022,PRJEB13831 +ERR2592265,ERR2592265,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB13831 +ERR2592266,ERR2592266,Netherlands,Amsterdam,2016-01-27,Munk 2022,PRJEB13831 +ERR2592267,ERR2592267,Norway,Oslo,2016-02-03,Munk 2022,PRJEB13831 +ERR2592268,ERR2592268,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB13831 +ERR2592269,ERR2592269,Singapore,"Singapore, Clementi",2016-02-03,Munk 2022,PRJEB13831 +ERR2592270,ERR2592270,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB13831 +ERR2592271,ERR2592271,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB13831 +ERR2592272,ERR2592272,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB13831 +ERR2592273,ERR2592273,Turkey,Ankara,2016-02-03,Munk 2022,PRJEB13831 +ERR2592274,ERR2592274,USA,"Seattle, WA",2016-02-01,Munk 2022,PRJEB13831 +ERR2592275,ERR2592275,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR2592276,ERR2592276,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR2592277,ERR2592277,USA,"Portland, OR",2016-02-10,Munk 2022,PRJEB13831 +ERR2592278,ERR2592278,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR2592279,ERR2592279,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR2592280,ERR2592280,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR2592281,ERR2592281,South Africa,Pretoria,2016-02-04,Munk 2022,PRJEB13831 +ERR2592282,ERR2592282,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB13831 +ERR2592283,ERR2592283,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR2592328,ERR2592328,Australia,Canberra,2016-02-04,Munk 2022,PRJEB13831 +ERR2592329,ERR2592329,Canada,Regina,2016-01-26,Munk 2022,PRJEB13831 
+ERR2592331,ERR2592331,Switzerland,Bern,2016-01-27,Munk 2022,PRJEB13831 +ERR2592332,ERR2592332,Germany,Berlin,2016-02-08,Munk 2022,PRJEB13831 +ERR2592333,ERR2592333,Ecuador,"San Cristobal Island, Gal?pagos",2016-02-06,Munk 2022,PRJEB13831 +ERR2592334,ERR2592334,Spain,Barcelona,2016-02-10,Munk 2022,PRJEB13831 +ERR2592335,ERR2592335,Croatia,Zagreb,2016-02-01,Munk 2022,PRJEB13831 +ERR2592336,ERR2592336,Latvia,Riga,2016-02-01,Munk 2022,PRJEB13831 +ERR2592337,ERR2592337,Malaysia,Kuala Lumpur,2016-02-04,Munk 2022,PRJEB13831 +ERR2592338,ERR2592338,Netherlands,Amsterdam,2016-01-27,Munk 2022,PRJEB13831 +ERR2592339,ERR2592339,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB13831 +ERR2592340,ERR2592340,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB13831 +ERR2592341,ERR2592341,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB13831 +ERR2592342,ERR2592342,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB13831 +ERR2592343,ERR2592343,Zambia,Kitwe,2016-01-12,Munk 2022,PRJEB13831 +ERR9855084,ERR9855084,China,Guangzhou,2016-02-02,Munk 2022,PRJEB13831 +ERR9855085,ERR9855085,Malaysia,Kuala Lumpur,2016-02-05,Munk 2022,PRJEB13831 +ERR9855086,ERR9855086,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB13831 +ERR9855087,ERR9855087,Canada,Regina,2016-01-26,Munk 2022,PRJEB13831 +ERR9855088,ERR9855088,Canada,Regina,2016-01-26,Munk 2022,PRJEB13831 +ERR9855089,ERR9855089,Germany,Berlin,2016-02-09,Munk 2022,PRJEB13831 +ERR9855090,ERR9855090,Tanzania,Moshi,2016-02-04,Munk 2022,PRJEB13831 +ERR9855091,ERR9855091,Brazil,Belem,2016-02-17,Munk 2022,PRJEB13831 +ERR9855092,ERR9855092,Germany,Berlin,2016-02-09,Munk 2022,PRJEB13831 +ERR9855093,ERR9855093,Iran,Tehran,2016-02-03,Munk 2022,PRJEB13831 +ERR9855094,ERR9855094,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB13831 +ERR9855095,ERR9855095,Iran,Tehran,2016-02-03,Munk 2022,PRJEB13831 +ERR9855096,ERR9855096,Tanzania,Moshi,2016-02-04,Munk 2022,PRJEB13831 +ERR9855097,ERR9855097,Brazil,Belem,2016-02-17,Munk 2022,PRJEB13831 +ERR9855098,ERR9855098,Malaysia,Kuala 
Lumpur,2016-02-05,Munk 2022,PRJEB13831 +ERR9855099,ERR9855099,China,Guangzhou,2016-02-02,Munk 2022,PRJEB13831 \ No newline at end of file diff --git a/data/2024-05-06_munk/Munk-PRJEB13831/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB13831/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..aba2815 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB13831/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB27054/adapters.fasta new file mode 100644 index 0000000..286a4ca --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB27054/adapters.fasta @@ -0,0 +1,69 @@ +>0 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>1 +TAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAA +>2 +CTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATA +>3 +unspecified +>4 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>5 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>6 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>7 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>8 +GAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACT +>9 +TTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGG +>10 +heifigepsna +>11 +GAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA +>12 +AGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAACTGAACTAGAAC +>13 +CCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAATAACCCTAA +>14 +TATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAG +>15 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>16 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>17 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>18 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>19 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>20 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>21 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>22 +AGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTC +>23 +CTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCT +>24 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>25 +TCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTCTC 
+>26 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>27 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>28 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>29 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>30 +AGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAG +>31 +CAAGCAGAAGACGGCATACGAGAT +>32 +TTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTAGGGTTATTA +>33 +GTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCTAGTTCAGTTCT diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..cabc844 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..3454031 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..674c175 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..83258d8 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..cc7ee3e Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_base_stats.tsv.gz new file 
mode 100644 index 0000000..854d07e Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..c7a153d Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB27054/sample-metadata.csv new file mode 100644 index 0000000..e78c8d5 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB27054/sample-metadata.csv @@ -0,0 +1,236 @@ +library,sample,country,city,date,dataset,bioproject +ERR2607371,ERR2607371,Albania,Tirana,2016-01-02,Munk 2022,PRJEB27054 +ERR2607372,ERR2607372,Albania,Tirana,2016-01-02,Munk 2022,PRJEB27054 +ERR2607373,ERR2607373,Albania,Tirana,2016-01-02,Munk 2022,PRJEB27054 +ERR2607374,ERR2607374,Australia,Woden,2016-04-02,Munk 2022,PRJEB27054 +ERR2607375,ERR2607375,Australia,Woden,2016-04-02,Munk 2022,PRJEB27054 +ERR2607376,ERR2607376,Australia,Woden,2016-04-02,Munk 2022,PRJEB27054 +ERR2607377,ERR2607377,Australia,Woden,2016-04-02,Munk 2022,PRJEB27054 +ERR2607378,ERR2607378,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB27054 +ERR2607379,ERR2607379,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB27054 +ERR2607380,ERR2607380,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB27054 +ERR2607381,ERR2607381,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB27054 +ERR2607382,ERR2607382,Australia,Melbourne,2016-08-02,Munk 2022,PRJEB27054 +ERR2607383,ERR2607383,Australia,Woden,2016-04-02,Munk 2022,PRJEB27054 +ERR2607384,ERR2607384,Austria,Vienna,2016-02-16,Munk 2022,PRJEB27054 +ERR2607385,ERR2607385,Austria,Vienna,2016-02-16,Munk 2022,PRJEB27054 +ERR2607386,ERR2607386,Austria,Vienna,2016-02-16,Munk 2022,PRJEB27054 +ERR2607387,ERR2607387,Austria,Vienna,2016-02-16,Munk 2022,PRJEB27054 
+ERR2607388,ERR2607388,Bulgaria,Sofia,2016-02-15,Munk 2022,PRJEB27054 +ERR2607389,ERR2607389,Bulgaria,Sofia,2016-02-15,Munk 2022,PRJEB27054 +ERR2607390,ERR2607390,Bulgaria,Sofia,2016-02-15,Munk 2022,PRJEB27054 +ERR2607391,ERR2607391,Bulgaria,Sofia,2016-02-15,Munk 2022,PRJEB27054 +ERR2607392,ERR2607392,Brazil,Belem,2016-02-16,Munk 2022,PRJEB27054 +ERR2607393,ERR2607393,Brazil,Belem,2016-02-16,Munk 2022,PRJEB27054 +ERR2607394,ERR2607394,Brazil,Belem,2016-02-15,Munk 2022,PRJEB27054 +ERR2607395,ERR2607395,Brazil,Belem,2016-02-15,Munk 2022,PRJEB27054 +ERR2607396,ERR2607396,Brazil,Belem,2016-02-15,Munk 2022,PRJEB27054 +ERR2607397,ERR2607397,Brazil,Belem,2016-02-16,Munk 2022,PRJEB27054 +ERR2607398,ERR2607398,Botswana,Palapye,2016-01-25,Munk 2022,PRJEB27054 +ERR2607399,ERR2607399,Botswana,Palapye,2016-01-25,Munk 2022,PRJEB27054 +ERR2607400,ERR2607400,Botswana,Palapye,2016-01-25,Munk 2022,PRJEB27054 +ERR2607401,ERR2607401,Botswana,Palapye,2016-01-25,Munk 2022,PRJEB27054 +ERR2607402,ERR2607402,Canada,Regina,2016-01-26,Munk 2022,PRJEB27054 +ERR2607403,ERR2607403,Canada,Regina,2016-01-26,Munk 2022,PRJEB27054 +ERR2607404,ERR2607404,Canada,Calgary,2016-01-02,Munk 2022,PRJEB27054 +ERR2607405,ERR2607405,Canada,Calgary,2016-01-02,Munk 2022,PRJEB27054 +ERR2607406,ERR2607406,Canada,Calgary,2016-01-02,Munk 2022,PRJEB27054 +ERR2607407,ERR2607407,Canada,Calgary,2016-01-02,Munk 2022,PRJEB27054 +ERR2607408,ERR2607408,Canada,Calgary,2016-01-02,Munk 2022,PRJEB27054 +ERR2607409,ERR2607409,Canada,Toronto,2016-01-02,Munk 2022,PRJEB27054 +ERR2607410,ERR2607410,Canada,Toronto,2016-01-02,Munk 2022,PRJEB27054 +ERR2607411,ERR2607411,Canada,Toronto,2016-01-02,Munk 2022,PRJEB27054 +ERR2607412,ERR2607412,Canada,Toronto,2016-01-02,Munk 2022,PRJEB27054 +ERR2607413,ERR2607413,Canada,Toronto,2016-01-02,Munk 2022,PRJEB27054 +ERR2607414,ERR2607414,Canada,Ottawa,2016-01-26,Munk 2022,PRJEB27054 +ERR2607415,ERR2607415,Canada,Regina,2016-01-26,Munk 2022,PRJEB27054 
+ERR2607416,ERR2607416,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607417,ERR2607417,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607418,ERR2607418,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607419,ERR2607419,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607420,ERR2607420,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607421,ERR2607421,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607422,ERR2607422,Switzerland,Liebefeld,2016-01-27,Munk 2022,PRJEB27054 +ERR2607423,ERR2607423,China,Guangzhou,2016-01-28,Munk 2022,PRJEB27054 +ERR2607424,ERR2607424,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB27054 +ERR2607425,ERR2607425,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB27054 +ERR2607426,ERR2607426,Cote d'Ivoire,Abidjan,2016-01-25,Munk 2022,PRJEB27054 +ERR2607427,ERR2607427,Colombia,Bogota,2016-01-27,Munk 2022,PRJEB27054 +ERR2607428,ERR2607428,Czech Republic,Brno,2016-01-02,Munk 2022,PRJEB27054 +ERR2607429,ERR2607429,Czech Republic,Brno,2016-01-02,Munk 2022,PRJEB27054 +ERR2607430,ERR2607430,Czech Republic,Brno,2016-01-02,Munk 2022,PRJEB27054 +ERR2607431,ERR2607431,Germany,Berlin,2016-08-02,Munk 2022,PRJEB27054 +ERR2607432,ERR2607432,Germany,Berlin,2016-02-08,Munk 2022,PRJEB27054 +ERR2607433,ERR2607433,Germany,Berlin,2016-08-02,Munk 2022,PRJEB27054 +ERR2607434,ERR2607434,Germany,Berlin,2016-08-02,Munk 2022,PRJEB27054 +ERR2607435,ERR2607435,Germany,Berlin,2016-08-02,Munk 2022,PRJEB27054 +ERR2607436,ERR2607436,Denmark,Avedore,2016-02-03,Munk 2022,PRJEB27054 +ERR2607437,ERR2607437,Denmark,Damhusaaen,2016-02-03,Munk 2022,PRJEB27054 +ERR2607438,ERR2607438,Denmark,Lynetten,2016-02-03,Munk 2022,PRJEB27054 +ERR2607439,ERR2607439,Ecuador,Galapagos,2016-06-02,Munk 2022,PRJEB27054 +ERR2607440,ERR2607440,Ecuador,Galapagos,2016-06-02,Munk 2022,PRJEB27054 +ERR2607441,ERR2607441,Ecuador,Quito,2016-01-02,Munk 2022,PRJEB27054 +ERR2607442,ERR2607442,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 
+ERR2607443,ERR2607443,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 +ERR2607444,ERR2607444,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 +ERR2607445,ERR2607445,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 +ERR2607446,ERR2607446,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 +ERR2607447,ERR2607447,Spain,Barcelona,2016-10-02,Munk 2022,PRJEB27054 +ERR2607448,ERR2607448,Ethiopia,Addis Ababa,2016-01-02,Munk 2022,PRJEB27054 +ERR2607449,ERR2607449,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB27054 +ERR2607450,ERR2607450,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB27054 +ERR2607451,ERR2607451,Finland,Helsinki,2016-01-26,Munk 2022,PRJEB27054 +ERR2607452,ERR2607452,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB27054 +ERR2607453,ERR2607453,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB27054 +ERR2607454,ERR2607454,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB27054 +ERR2607455,ERR2607455,Georgia,Tbilisi,2016-01-25,Munk 2022,PRJEB27054 +ERR2607456,ERR2607456,Ghana,Tamale,2016-01-30,Munk 2022,PRJEB27054 +ERR2607457,ERR2607457,Gambia,Banjul,2016-01-28,Munk 2022,PRJEB27054 +ERR2607458,ERR2607458,Croatia,Zagreb,2016-01-02,Munk 2022,PRJEB27054 +ERR2607459,ERR2607459,Croatia,Zagreb,2016-01-02,Munk 2022,PRJEB27054 +ERR2607460,ERR2607460,Croatia,Zagreb,2016-01-02,Munk 2022,PRJEB27054 +ERR2607461,ERR2607461,Croatia,Zagreb,2016-01-02,Munk 2022,PRJEB27054 +ERR2607462,ERR2607462,Croatia,Zagreb,2016-01-02,Munk 2022,PRJEB27054 +ERR2607463,ERR2607463,Hungary,Budapest,2016-02-02,Munk 2022,PRJEB27054 +ERR2607464,ERR2607464,India,Kochi,2016-01-27,Munk 2022,PRJEB27054 +ERR2607465,ERR2607465,India,Cochin,2016-01-27,Munk 2022,PRJEB27054 +ERR2607466,ERR2607466,India,Cochin,2016-01-27,Munk 2022,PRJEB27054 +ERR2607467,ERR2607467,Ireland,Dublin,2016-10-02,Munk 2022,PRJEB27054 +ERR2607468,ERR2607468,Ireland,Dublin,2016-10-02,Munk 2022,PRJEB27054 +ERR2607469,ERR2607469,Ireland,Dublin,2016-10-02,Munk 2022,PRJEB27054 +ERR2607470,ERR2607470,Iran,Tehran,2016-02-02,Munk 2022,PRJEB27054 
+ERR2607471,ERR2607471,Iran,Tehran,2016-02-02,Munk 2022,PRJEB27054 +ERR2607472,ERR2607472,Iran,Tehran,2016-02-02,Munk 2022,PRJEB27054 +ERR2607473,ERR2607473,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB27054 +ERR2607474,ERR2607474,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB27054 +ERR2607475,ERR2607475,Iceland,Reykjavik,2016-02-02,Munk 2022,PRJEB27054 +ERR2607476,ERR2607476,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB27054 +ERR2607477,ERR2607477,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB27054 +ERR2607478,ERR2607478,Israel,Jerusalem,2016-01-31,Munk 2022,PRJEB27054 +ERR2607479,ERR2607479,Italy,Rome,2016-02-02,Munk 2022,PRJEB27054 +ERR2607480,ERR2607480,Kazakhstan,Almaty,2016-02-02,Munk 2022,PRJEB27054 +ERR2607481,ERR2607481,Kenya,Thika,2016-08-02,Munk 2022,PRJEB27054 +ERR2607482,ERR2607482,Kenya,Thika,2016-08-02,Munk 2022,PRJEB27054 +ERR2607483,ERR2607483,Kenya,Thika,2016-08-02,Munk 2022,PRJEB27054 +ERR2607484,ERR2607484,Kenya,Thika,2016-08-02,Munk 2022,PRJEB27054 +ERR2607485,ERR2607485,Cambodia,Phnom Penh,2016-05-02,Munk 2022,PRJEB27054 +ERR2607486,ERR2607486,Sri Lanka,Colombo,2016-02-28,Munk 2022,PRJEB27054 +ERR2607487,ERR2607487,Luxembourg,Belvaux,2016-01-02,Munk 2022,PRJEB27054 +ERR2607488,ERR2607488,Latvia,Riga,2016-01-02,Munk 2022,PRJEB27054 +ERR2607489,ERR2607489,Latvia,Riga,2016-01-02,Munk 2022,PRJEB27054 +ERR2607490,ERR2607490,Latvia,Riga,2016-01-02,Munk 2022,PRJEB27054 +ERR2607491,ERR2607491,Latvia,Riga,2016-01-02,Munk 2022,PRJEB27054 +ERR2607492,ERR2607492,Moldova,Chisinau,2016-02-04,Munk 2022,PRJEB27054 +ERR2607493,ERR2607493,Moldova,Chisinau,2016-02-04,Munk 2022,PRJEB27054 +ERR2607494,ERR2607494,Moldova,Chisinau,2016-02-04,Munk 2022,PRJEB27054 +ERR2607495,ERR2607495,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB27054 +ERR2607496,ERR2607496,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB27054 +ERR2607497,ERR2607497,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB27054 +ERR2607498,ERR2607498,North Macedonia,Skopje,2016-02-02,Munk 2022,PRJEB27054 
+ERR2607499,ERR2607499,Malta,St. Venera,2016-01-02,Munk 2022,PRJEB27054 +ERR2607500,ERR2607500,Malta,St. Venera,2016-01-02,Munk 2022,PRJEB27054 +ERR2607501,ERR2607501,Malta,St. Venera,2016-01-02,Munk 2022,PRJEB27054 +ERR2607502,ERR2607502,Malaysia,Kuala Lumpur,2016-04-02,Munk 2022,PRJEB27054 +ERR2607503,ERR2607503,Malaysia,Kuala Lumpur,2016-04-02,Munk 2022,PRJEB27054 +ERR2607504,ERR2607504,Malaysia,Kuala Lumpur,2016-04-02,Munk 2022,PRJEB27054 +ERR2607505,ERR2607505,Malaysia,Kuala Lumpur,2016-04-02,Munk 2022,PRJEB27054 +ERR2607506,ERR2607506,Malaysia,Kuala Lumpur,2016-04-02,Munk 2022,PRJEB27054 +ERR2607507,ERR2607507,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB27054 +ERR2607508,ERR2607508,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB27054 +ERR2607509,ERR2607509,Nigeria,Lagos,2016-02-02,Munk 2022,PRJEB27054 +ERR2607510,ERR2607510,Netherlands,Bilthoven,2016-01-27,Munk 2022,PRJEB27054 +ERR2607511,ERR2607511,Netherlands,Bilthoven,2016-01-27,Munk 2022,PRJEB27054 +ERR2607512,ERR2607512,Netherlands,Bilthoven,2016-01-27,Munk 2022,PRJEB27054 +ERR2607513,ERR2607513,Netherlands,Bilthoven,2016-01-27,Munk 2022,PRJEB27054 +ERR2607514,ERR2607514,Netherlands,Bilthoven,2016-01-27,Munk 2022,PRJEB27054 +ERR2607515,ERR2607515,Norway,Oslo,2016-03-02,Munk 2022,PRJEB27054 +ERR2607516,ERR2607516,Norway,Oslo,2016-03-02,Munk 2022,PRJEB27054 +ERR2607517,ERR2607517,Norway,Oslo,2016-03-02,Munk 2022,PRJEB27054 +ERR2607518,ERR2607518,Nepal,Kathmandu,2016-01-30,Munk 2022,PRJEB27054 +ERR2607519,ERR2607519,New Zealand,Dunedin,2016-01-27,Munk 2022,PRJEB27054 +ERR2607520,ERR2607520,Pakistan,Karachi,2016-01-27,Munk 2022,PRJEB27054 +ERR2607521,ERR2607521,Peru,Lima,2016-04-02,Munk 2022,PRJEB27054 +ERR2607522,ERR2607522,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB27054 +ERR2607523,ERR2607523,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB27054 +ERR2607524,ERR2607524,Poland,Pulawy,2016-02-02,Munk 2022,PRJEB27054 +ERR2607525,ERR2607525,Senegal,Dakar,2016-02-02,Munk 2022,PRJEB27054 
+ERR2607526,ERR2607526,Singapore,Singapore,2016-03-02,Munk 2022,PRJEB27054 +ERR2607527,ERR2607527,Singapore,Singapore,2016-03-02,Munk 2022,PRJEB27054 +ERR2607528,ERR2607528,Singapore,Singapore,2016-03-02,Munk 2022,PRJEB27054 +ERR2607529,ERR2607529,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB27054 +ERR2607530,ERR2607530,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB27054 +ERR2607531,ERR2607531,Serbia,Belgrade,2016-01-27,Munk 2022,PRJEB27054 +ERR2607532,ERR2607532,Slovakia,Bratislava,2016-02-02,Munk 2022,PRJEB27054 +ERR2607533,ERR2607533,Slovakia,Bratislava,2016-02-02,Munk 2022,PRJEB27054 +ERR2607534,ERR2607534,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB27054 +ERR2607535,ERR2607535,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB27054 +ERR2607536,ERR2607536,Slovenia,Ljubljana,2016-01-25,Munk 2022,PRJEB27054 +ERR2607537,ERR2607537,Sweden,Gothenburg,2016-11-02,Munk 2022,PRJEB27054 +ERR2607538,ERR2607538,Sweden,Gothenburg,2016-11-02,Munk 2022,PRJEB27054 +ERR2607539,ERR2607539,Sweden,Gothenburg,2016-11-02,Munk 2022,PRJEB27054 +ERR2607540,ERR2607540,Sweden,Uppsala,2016-02-02,Munk 2022,PRJEB27054 +ERR2607541,ERR2607541,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB27054 +ERR2607542,ERR2607542,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB27054 +ERR2607543,ERR2607543,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB27054 +ERR2607544,ERR2607544,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB27054 +ERR2607545,ERR2607545,Chad,N'Djamena,2016-01-26,Munk 2022,PRJEB27054 +ERR2607546,ERR2607546,Togo,Lome,2016-01-26,Munk 2022,PRJEB27054 +ERR2607547,ERR2607547,Turkey,Ankara,2016-01-29,Munk 2022,PRJEB27054 +ERR2607548,ERR2607548,Turkey,Ankara,2016-01-29,Munk 2022,PRJEB27054 +ERR2607549,ERR2607549,Turkey,Ankara,2016-01-29,Munk 2022,PRJEB27054 +ERR2607550,ERR2607550,Tanzania,Moshi,2016-03-02,Munk 2022,PRJEB27054 +ERR2607551,ERR2607551,USA,"Seattle, WA",2016-01-02,Munk 2022,PRJEB27054 +ERR2607552,ERR2607552,USA,"Seattle, WA",2016-01-02,Munk 2022,PRJEB27054 +ERR2607553,ERR2607553,USA,"Seattle, WA",2016-01-02,Munk 
2022,PRJEB27054 +ERR2607554,ERR2607554,USA,"Seattle, WA",2016-01-02,Munk 2022,PRJEB27054 +ERR2607555,ERR2607555,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB27054 +ERR2607556,ERR2607556,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB27054 +ERR2607557,ERR2607557,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB27054 +ERR2607558,ERR2607558,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB27054 +ERR2607559,ERR2607559,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607560,ERR2607560,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607561,ERR2607561,USA,"Chicago, IL",2016-02-24,Munk 2022,PRJEB27054 +ERR2607562,ERR2607562,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607563,ERR2607563,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607564,ERR2607564,USA,"Portland, OR",2016-10-02,Munk 2022,PRJEB27054 +ERR2607565,ERR2607565,USA,"Portland, OR",2016-10-02,Munk 2022,PRJEB27054 +ERR2607566,ERR2607566,USA,"Portland, OR",2016-10-02,Munk 2022,PRJEB27054 +ERR2607567,ERR2607567,USA,"Portland, OR",2016-10-02,Munk 2022,PRJEB27054 +ERR2607568,ERR2607568,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607569,ERR2607569,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607570,ERR2607570,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607571,ERR2607571,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607572,ERR2607572,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607573,ERR2607573,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607574,ERR2607574,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607575,ERR2607575,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607576,ERR2607576,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607577,ERR2607577,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607578,ERR2607578,USA,"El Paso, TX",2016-02-22,Munk 2022,PRJEB27054 +ERR2607579,ERR2607579,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607580,ERR2607580,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 
+ERR2607581,ERR2607581,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607582,ERR2607582,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607583,ERR2607583,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607584,ERR2607584,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607585,ERR2607585,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607586,ERR2607586,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607587,ERR2607587,USA,"El Paso, TX",2016-02-23,Munk 2022,PRJEB27054 +ERR2607588,ERR2607588,USA,South Adams County,2016-01-01,Munk 2022,PRJEB27054 +ERR2607589,ERR2607589,USA,"Boulder, CO",2016-03-28,Munk 2022,PRJEB27054 +ERR2607590,ERR2607590,USA,"Atlanta, GA",2016-02-02,Munk 2022,PRJEB27054 +ERR2607591,ERR2607591,Viet Nam,Ho Chi Minh,2016-01-27,Munk 2022,PRJEB27054 +ERR2607592,ERR2607592,Kosovo,Prishtina,2016-03-02,Munk 2022,PRJEB27054 +ERR2607593,ERR2607593,South Africa,Johannesburg,2016-04-02,Munk 2022,PRJEB27054 +ERR2607594,ERR2607594,South Africa,Johannesburg,2016-04-02,Munk 2022,PRJEB27054 +ERR2607595,ERR2607595,South Africa,Johannesburg,2016-04-02,Munk 2022,PRJEB27054 +ERR2607596,ERR2607596,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB27054 +ERR2607597,ERR2607597,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB27054 +ERR2607598,ERR2607598,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607599,ERR2607599,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607600,ERR2607600,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607601,ERR2607601,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607602,ERR2607602,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607603,ERR2607603,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607604,ERR2607604,Zambia,Kitwe,2016-12-01,Munk 2022,PRJEB27054 +ERR2607605,ERR2607605,Zambia,Lusaka,2016-01-25,Munk 2022,PRJEB27054 diff --git a/data/2024-05-06_munk/Munk-PRJEB27054/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27054/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..6395d5e 
Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27054/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB27621/adapters.fasta new file mode 100644 index 0000000..8841421 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB27621/adapters.fasta @@ -0,0 +1,61 @@ +>0 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>1 +ACACACACACACACACACACACACACACACACACACACACACACACACACACACACACAC +>2 +CAAGCAGAAGACGGCATACGAGAT +>3 +TGCTGCTGCT +>4 +TGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG +>5 +CACACACACACACACACACACACACACACACACACACACACACACACACACACACACACA +>6 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>7 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>8 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>9 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>10 +heifigepsna +>11 +unspecified +>12 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>13 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>14 +AGCAGCAGCA +>15 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>16 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>17 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>18 +TACATACATACATACATACATACATACATACATACATACATACATACATACATACATACA +>19 +CATACATACATACATACATACATACATACATACATACATACATACATACATACATACATA +>20 +TATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATG +>21 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>22 +TGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTA +>23 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>24 +GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGT +>25 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>26 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>27 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>28 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>29 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..faf8878 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/hv_clade_counts.tsv.gz differ 
diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..707f91d Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..2ab0bcd Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..68b5572 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..928a3e1 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..4bc3755 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..832b8e9 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB27621/sample-metadata.csv new file mode 100644 index 0000000..38b4c21 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB27621/sample-metadata.csv @@ 
-0,0 +1,176 @@ +library,sample,country,city,date,dataset,bioproject +ERR2683114,ERR2683114,Ecuador,Galapagos,2017-06-27,Munk 2022,PRJEB27621 +ERR2683115,ERR2683115,Malawi,Kasungu,2017-06-17,Munk 2022,PRJEB27621 +ERR2683116,ERR2683116,Malawi,Mzuzu,2017-06-17,Munk 2022,PRJEB27621 +ERR2683117,ERR2683117,Malawi,Zomba,2017-06-13,Munk 2022,PRJEB27621 +ERR2683118,ERR2683118,Malawi,Blantyre,2017-06-13,Munk 2022,PRJEB27621 +ERR2683119,ERR2683119,Iceland,Reykjavik,2017-06-18,Munk 2022,PRJEB27621 +ERR2683120,ERR2683120,Poland,Pulawy,2017-06-05,Munk 2022,PRJEB27621 +ERR2683121,ERR2683121,Nigeria,Ilorin,2017-06-15,Munk 2022,PRJEB27621 +ERR2683122,ERR2683122,Nigeria,Ilorin,2017-06-18,Munk 2022,PRJEB27621 +ERR2683123,ERR2683123,Cote d'Ivoire,Yamoussoukro,2017-06-12,Munk 2022,PRJEB27621 +ERR2683124,ERR2683124,Nigeria,Ilorin,2017-06-18,Munk 2022,PRJEB27621 +ERR2683125,ERR2683125,Nigeria,Zaria,2017-06-12,Munk 2022,PRJEB27621 +ERR2683126,ERR2683126,Nigeria,Zaria,2017-06-12,Munk 2022,PRJEB27621 +ERR2683127,ERR2683127,Nigeria,Zaria,2017-06-12,Munk 2022,PRJEB27621 +ERR2683128,ERR2683128,Nigeria,Ibadan,2017-07-18,Munk 2022,PRJEB27621 +ERR2683129,ERR2683129,Nigeria,Ibadan,2017-07-18,Munk 2022,PRJEB27621 +ERR2683130,ERR2683130,Nicaragua,Managua,2017-06-29,Munk 2022,PRJEB27621 +ERR2683131,ERR2683131,Nicaragua,Matagalpa,2017-06-25,Munk 2022,PRJEB27621 +ERR2683132,ERR2683132,Nicaragua,Granada,2017-07-02,Munk 2022,PRJEB27621 +ERR2683133,ERR2683133,Nicaragua,Masaya,2017-07-02,Munk 2022,PRJEB27621 +ERR2683134,ERR2683134,Malaysia,Kuala Lumpur,2017-06-15,Munk 2022,PRJEB27621 +ERR2683135,ERR2683135,Tanzania,Moshi,2017-06-12,Munk 2022,PRJEB27621 +ERR2683136,ERR2683136,Estonia,Tartu,2017-06-19,Munk 2022,PRJEB27621 +ERR2683137,ERR2683137,Estonia,Tallinn,2017-06-19,Munk 2022,PRJEB27621 +ERR2683138,ERR2683138,Estonia,Parnu,2017-06-13,Munk 2022,PRJEB27621 +ERR2683139,ERR2683139,Estonia,Narva,2017-06-07,Munk 2022,PRJEB27621 +ERR2683140,ERR2683140,Thailand,Pathum Thani,2017-06-15,Munk 2022,PRJEB27621 
+ERR2683141,ERR2683141,Georgia,Kutaisi,2017-06-19,Munk 2022,PRJEB27621 +ERR2683142,ERR2683142,Croatia,Zagreb,2017-06-07,Munk 2022,PRJEB27621 +ERR2683143,ERR2683143,South Africa,Pretoria,2017-06-05,Munk 2022,PRJEB27621 +ERR2683144,ERR2683144,Luxembourg,Luxembourg,2017-06-13,Munk 2022,PRJEB27621 +ERR2683145,ERR2683145,Malaysia,Klang,2017-06-16,Munk 2022,PRJEB27621 +ERR2683146,ERR2683146,Italy,Rome,2017-06-05,Munk 2022,PRJEB27621 +ERR2683147,ERR2683147,South Korea,Gwangju,2017-06-15,Munk 2022,PRJEB27621 +ERR2683148,ERR2683148,Greece,Thessaloniki,2017-06-19,Munk 2022,PRJEB27621 +ERR2683149,ERR2683149,Australia,Melbourne,2017-06-14,Munk 2022,PRJEB27621 +ERR2683150,ERR2683150,Portugal,Porto,2017-06-13,Munk 2022,PRJEB27621 +ERR2683151,ERR2683151,United Kingdom,Edinburgh,2017-06-12,Munk 2022,PRJEB27621 +ERR2683152,ERR2683152,Canada,Regina,2017-06-13,Munk 2022,PRJEB27621 +ERR2683153,ERR2683153,Canada,Vancouver,2017-06-23,Munk 2022,PRJEB27621 +ERR2683154,ERR2683154,Canada,Calgary,2017-06-13,Munk 2022,PRJEB27621 +ERR2683155,ERR2683155,Canada,Ottawa,2017-06-14,Munk 2022,PRJEB27621 +ERR2683156,ERR2683156,Cameroon,Yaounde,2017-06-15,Munk 2022,PRJEB27621 +ERR2683157,ERR2683157,Bulgaria,Sofia,2017-06-18,Munk 2022,PRJEB27621 +ERR2683158,ERR2683158,United Kingdom,Newcastle Upon Tyne,2017-07-29,Munk 2022,PRJEB27621 +ERR2683159,ERR2683159,Canada,Toronto,2017-06-14,Munk 2022,PRJEB27621 +ERR2683160,ERR2683160,Uganda,Kampala,2017-06-19,Munk 2022,PRJEB27621 +ERR2683161,ERR2683161,India,Udupi,2017-06-19,Munk 2022,PRJEB27621 +ERR2683162,ERR2683162,Norway,Oslo,2017-06-07,Munk 2022,PRJEB27621 +ERR2683163,ERR2683163,Pakistan,Karachi,2017-06-12,Munk 2022,PRJEB27621 +ERR2683164,ERR2683164,Pakistan,Hyderabad,2017-06-15,Munk 2022,PRJEB27621 +ERR2683165,ERR2683165,Burkina Faso,Ouagadougou,2017-06-13,Munk 2022,PRJEB27621 +ERR2683166,ERR2683166,Burkina Faso,Bobo Dioulasso,2017-06-16,Munk 2022,PRJEB27621 +ERR2683167,ERR2683167,Hong Kong,Hong Kong,2017-06-23,Munk 2022,PRJEB27621 
+ERR2683168,ERR2683168,Singapore,Singapore,2017-06-14,Munk 2022,PRJEB27621 +ERR2683169,ERR2683169,Paraguay,Costanera,2017-06-15,Munk 2022,PRJEB27621 +ERR2683170,ERR2683170,Nepal,Kathmandu,2017-06,Munk 2022,PRJEB27621 +ERR2683171,ERR2683171,Cambodia,Phnom Penh,2017-06-13,Munk 2022,PRJEB27621 +ERR2683172,ERR2683172,Guatemala,Guatemala City,2017-06-15,Munk 2022,PRJEB27621 +ERR2683173,ERR2683173,Lithuania,Vilnius,2017-08-02,Munk 2022,PRJEB27621 +ERR2683174,ERR2683174,Kenya,Nairobi,2017-06-13,Munk 2022,PRJEB27621 +ERR2683175,ERR2683175,Kazakhstan,Almaty,2017-07-12,Munk 2022,PRJEB27621 +ERR2683176,ERR2683176,USA,Chapel Hill,2017-06-20,Munk 2022,PRJEB27621 +ERR2683177,ERR2683177,North Macedonia,Skopje,2017-06-15,Munk 2022,PRJEB27621 +ERR2683178,ERR2683178,Viet Nam,Hanoi,2017-06-16,Munk 2022,PRJEB27621 +ERR2683179,ERR2683179,Ghana,Tamale,2017-06-18,Munk 2022,PRJEB27621 +ERR2683180,ERR2683180,Iran,Tehran,2017-06-11,Munk 2022,PRJEB27621 +ERR2683181,ERR2683181,Morocco,Casablanca,2017-07-17,Munk 2022,PRJEB27621 +ERR2683182,ERR2683182,Sri Lanka,Raddolugama,2017-07-11,Munk 2022,PRJEB27621 +ERR2683183,ERR2683183,Sri Lanka,Rathmalana,2017-07-13,Munk 2022,PRJEB27621 +ERR2683184,ERR2683184,Sri Lanka,Colombo,2017-07-25,Munk 2022,PRJEB27621 +ERR2683185,ERR2683185,Sri Lanka,Colombo,2017-07-25,Munk 2022,PRJEB27621 +ERR2683186,ERR2683186,Sri Lanka,Colombo,2017-07-25,Munk 2022,PRJEB27621 +ERR2683187,ERR2683187,Iran,Shiraz,2017-06-15,Munk 2022,PRJEB27621 +ERR2683188,ERR2683188,Brazil,Belem,2017-06-23,Munk 2022,PRJEB27621 +ERR2683189,ERR2683189,USA,"Portland, OR",2017-01-01,Munk 2022,PRJEB27621 +ERR2683190,ERR2683190,Greenland,Sisimiut,2017-06-16,Munk 2022,PRJEB27621 +ERR2683191,ERR2683191,Malaysia,Sungai Petani,2017-06-12,Munk 2022,PRJEB27621 +ERR2683193,ERR2683193,Malaysia,George Town,2017-06-08,Munk 2022,PRJEB27621 +ERR2683194,ERR2683194,Malaysia,Ipoh,2017-08-12,Munk 2022,PRJEB27621 +ERR2683195,ERR2683195,Malaysia,Butterworth,2017-06-08,Munk 2022,PRJEB27621 
+ERR2683196,ERR2683196,Japan,Anonymous,2017-06-12,Munk 2022,PRJEB27621 +ERR2683197,ERR2683197,Denmark,Avedore,2017-06-15,Munk 2022,PRJEB27621 +ERR2683198,ERR2683198,Netherlands,Amsterdam,2017-06-12,Munk 2022,PRJEB27621 +ERR2683199,ERR2683199,Netherlands,Utrecht,2017-06-12,Munk 2022,PRJEB27621 +ERR2683200,ERR2683200,Uruguay,Montevideo,2017-06-16,Munk 2022,PRJEB27621 +ERR2683203,ERR2683203,Portugal,Lisbon,2017-06-06,Munk 2022,PRJEB27621 +ERR2683209,ERR2683209,Germany,Dresden,2017-07-09,Munk 2022,PRJEB27621 +ERR2683210,ERR2683210,Albania,Tirana,2017-06-05,Munk 2022,PRJEB27621 +ERR2683211,ERR2683211,Ghana,Accra,2017-06-16,Munk 2022,PRJEB27621 +ERR2683212,ERR2683212,Ghana,Kumasi,2017-06-16,Munk 2022,PRJEB27621 +ERR2683213,ERR2683213,New Zealand,Christchurch,2017-06-06,Munk 2022,PRJEB27621 +ERR2683214,ERR2683214,Spain,Barcelona,2017-06-08,Munk 2022,PRJEB27621 +ERR2683215,ERR2683215,Philippines,Manila,2017-06-29,Munk 2022,PRJEB27621 +ERR2683216,ERR2683216,France,Nantes,2017-06-19,Munk 2022,PRJEB27621 +ERR2683217,ERR2683217,France,Saint Philibert,2017-06-20,Munk 2022,PRJEB27621 +ERR2683218,ERR2683218,Switzerland,Bern,2017-06-11,Munk 2022,PRJEB27621 +ERR2683219,ERR2683219,Slovenia,Ljubljana,2017-06-14,Munk 2022,PRJEB27621 +ERR2683220,ERR2683220,Sweden,Uppsala,2017-06-12,Munk 2022,PRJEB27621 +ERR2683221,ERR2683221,Italy,Verbania,2017-06-12,Munk 2022,PRJEB27621 +ERR2683222,ERR2683222,Italy,Cannobio,2017-06-12,Munk 2022,PRJEB27621 +ERR2683223,ERR2683223,Italy,Novara,2017-06-12,Munk 2022,PRJEB27621 +ERR2683224,ERR2683224,Ireland,Galway,2017-06-12,Munk 2022,PRJEB27621 +ERR2683225,ERR2683225,Greece,Athens,2017-06-09,Munk 2022,PRJEB27621 +ERR2683226,ERR2683226,Austria,Vienna,2017-06-07,Munk 2022,PRJEB27621 +ERR2683227,ERR2683227,Senegal,Dakar,2017-06-15,Munk 2022,PRJEB27621 +ERR2683228,ERR2683228,Montenegro,Podgorica,2017-06-16,Munk 2022,PRJEB27621 +ERR2683229,ERR2683229,Ethiopia,Addis Ababa,2017-06-27,Munk 2022,PRJEB27621 +ERR2683230,ERR2683230,Serbia,Belgrade,2017-06-16,Munk 
2022,PRJEB27621 +ERR2683231,ERR2683231,Finland,Turku,2017-06-12,Munk 2022,PRJEB27621 +ERR2683232,ERR2683232,Finland,Oulu,2017-06-12,Munk 2022,PRJEB27621 +ERR2683233,ERR2683233,Finland,Helsinki,2017-06-06,Munk 2022,PRJEB27621 +ERR2683234,ERR2683234,Czech Republic,Prague,2017-06-13,Munk 2022,PRJEB27621 +ERR2683235,ERR2683235,Czech Republic,Brno,2017-06-13,Munk 2022,PRJEB27621 +ERR2683236,ERR2683236,United Kingdom,Falmouth,2017-06-09,Munk 2022,PRJEB27621 +ERR2683237,ERR2683237,Tanzania,Mwanza,2017-06-12,Munk 2022,PRJEB27621 +ERR2683238,ERR2683238,Tanzania,Morogoro,2017-06-17,Munk 2022,PRJEB27621 +ERR2683239,ERR2683239,Paraguay,San Lorenzo,2017-06-16,Munk 2022,PRJEB27621 +ERR2683240,ERR2683240,Paraguay,Asuncion,2017-06-15,Munk 2022,PRJEB27621 +ERR2683241,ERR2683241,Barbados,Bridgetown,2017-06-15,Munk 2022,PRJEB27621 +ERR2683242,ERR2683242,Barbados,Worthing,2017-06-15,Munk 2022,PRJEB27621 +ERR2683243,ERR2683243,USA,Waco,2017-06-15,Munk 2022,PRJEB27621 +ERR2683244,ERR2683244,USA,Austin,2017-06-08,Munk 2022,PRJEB27621 +ERR2683245,ERR2683245,USA,Houston,2017-06-09,Munk 2022,PRJEB27621 +ERR2683246,ERR2683246,Bangladesh,Dhaka,2017-06-14,Munk 2022,PRJEB27621 +ERR2683247,ERR2683247,United Kingdom,Camborne,2017-06-09,Munk 2022,PRJEB27621 +ERR2683248,ERR2683248,USA,Houston,2017-06-09,Munk 2022,PRJEB27621 +ERR2683249,ERR2683249,Slovakia,Bratislava,2017-06-22,Munk 2022,PRJEB27621 +ERR2683250,ERR2683250,Sweden,Gothenburg,2017-06-08,Munk 2022,PRJEB27621 +ERR2683251,ERR2683251,Latvia,Riga,2017-06-06,Munk 2022,PRJEB27621 +ERR2683252,ERR2683252,Latvia,Liepaja,2017-06-12,Munk 2022,PRJEB27621 +ERR2683253,ERR2683253,Germany,Berlin,2017-06-14,Munk 2022,PRJEB27621 +ERR2683254,ERR2683254,Mauritius,Beau Bassin Rose-Hill,2017-06-13,Munk 2022,PRJEB27621 +ERR2683255,ERR2683255,Israel,Beer-Sheva,2017-06-12,Munk 2022,PRJEB27621 +ERR2683256,ERR2683256,Bosnia and Herzegovina,Banja Luka,2017-06-13,Munk 2022,PRJEB27621 +ERR2683257,ERR2683257,Australia,Maningrida,2017-06-20,Munk 2022,PRJEB27621 
+ERR2683258,ERR2683258,Chad,N'Djamena,2017-06-11,Munk 2022,PRJEB27621 +ERR2683259,ERR2683259,Australia,Darwin,2017-06-15,Munk 2022,PRJEB27621 +ERR2683260,ERR2683260,Malta,Xghajra,2017-06-12,Munk 2022,PRJEB27621 +ERR2683261,ERR2683261,Malta,Cirkewwa,2017-06-13,Munk 2022,PRJEB27621 +ERR2683262,ERR2683262,Malta,Ghajnsielem,2017-06-13,Munk 2022,PRJEB27621 +ERR2683263,ERR2683263,Hungary,Budapest,2017-06-08,Munk 2022,PRJEB27621 +ERR2683264,ERR2683264,Saudi Arabia,Thuwal,2017-06-05,Munk 2022,PRJEB27621 +ERR2683265,ERR2683265,Saint Lucia,Castries,2017-06-22,Munk 2022,PRJEB27621 +ERR2683266,ERR2683266,Saint Lucia,Gros Islet,2017-06-22,Munk 2022,PRJEB27621 +ERR2683267,ERR2683267,India,Kochi,2017-06-13,Munk 2022,PRJEB27621 +ERR2683268,ERR2683268,Togo,Lome,2017-06-14,Munk 2022,PRJEB27621 +ERR2683269,ERR2683269,Cote d'Ivoire,Abidjan,2017-06-13,Munk 2022,PRJEB27621 +ERR2683270,ERR2683270,Nigeria,Lagos,2017-06-07,Munk 2022,PRJEB27621 +ERR2683271,ERR2683271,Cote d'Ivoire,Bwake,2017-06-12,Munk 2022,PRJEB27621 +ERR2683272,ERR2683272,Botswana,Gaborone,2017-06-09,Munk 2022,PRJEB27621 +ERR2683273,ERR2683273,Taiwan,Taipei,2017-06-06,Munk 2022,PRJEB27621 +ERR2683274,ERR2683274,USA,"Seattle, WA",2017-06-13,Munk 2022,PRJEB27621 +ERR2683275,ERR2683275,USA,Woodinville WA,2017-06-19,Munk 2022,PRJEB27621 +ERR2683276,ERR2683276,Ecuador,Quito,2017-12-14,Munk 2022,PRJEB27621 +ERR2683277,ERR2683277,Peru,Piura City,2017-06-07,Munk 2022,PRJEB27621 +ERR2683278,ERR2683278,Ecuador,Guayaquil,2017-06-14,Munk 2022,PRJEB27621 +ERR2683279,ERR2683279,Ecuador,Cuenca,2017-06-09,Munk 2022,PRJEB27621 +ERR2683280,ERR2683280,Cote d'Ivoire,Bwake,2017-06-12,Munk 2022,PRJEB27621 +ERR2683525,ERR2683525,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683526,ERR2683526,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683527,ERR2683527,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683528,ERR2683528,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683529,ERR2683529,Denmark,Lyngby,2017-01-01,Munk 
2022,PRJEB27621 +ERR2683530,ERR2683530,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683531,ERR2683531,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683532,ERR2683532,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683533,ERR2683533,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683534,ERR2683534,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683535,ERR2683535,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683536,ERR2683536,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683537,ERR2683537,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR2683538,ERR2683538,Denmark,Lyngby,2017-01-01,Munk 2022,PRJEB27621 +ERR9833719,ERR9833719,Australia,Melbourne,2017-05-12,Munk 2022,PRJEB27621 +ERR9834200,ERR9834200,China,Guangzhou,2017-06-16,Munk 2022,PRJEB27621 diff --git a/data/2024-05-06_munk/Munk-PRJEB27621/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB27621/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..603fe59 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB27621/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB40798/adapters.fasta new file mode 100644 index 0000000..4b3a5c7 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40798/adapters.fasta @@ -0,0 +1,57 @@ +>0 +TACATACATACATACATACATACATACATACATACATACATACATACATACATACATACA +>1 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>3 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>4 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>5 +ATACATACATACATACATACATACATACATACATACATACATACATACATACATACATAC +>6 +TATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATG +>7 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>8 +CAAGCAGAAGACGGCATACGAGAT +>9 +GTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTAT +>10 +unspecified +>11 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>12 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>13 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>14 
+TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>15 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>16 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>17 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>18 +TGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTA +>19 +CATACATACATACATACATACATACATACATACATACATACATACATACATACATACATA +>20 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>21 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>22 +ATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGT +>23 +ACATACATACATACATACATACATACATACATACATACATACATACATACATACATACAT +>24 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>25 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>26 +heifigepsna +>27 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..ccad289 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..eccb21a Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..c93d769 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..e7cf5d9 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/qc_basic_stats.tsv.gz new file mode 100644 index 
0000000..9a11fab Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..1b5d7a9 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..556184e Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB40798/sample-metadata.csv new file mode 100644 index 0000000..3f69f26 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40798/sample-metadata.csv @@ -0,0 +1,151 @@ +library,sample,country,city,date,dataset,bioproject +ERR4678558,ERR4678558,Austria,Vienna,2018-11-08,Munk 2022,PRJEB40798 +ERR4678559,ERR4678559,South Africa,Pretoria,2018-11-22,Munk 2022,PRJEB40798 +ERR4678560,ERR4678560,Sweden,Gothenburg,2018-11-21,Munk 2022,PRJEB40798 +ERR4678561,ERR4678561,Chad,N'Djamena,2018-11-18,Munk 2022,PRJEB40798 +ERR4678562,ERR4678562,Latvia,Riga,2018-11-16,Munk 2022,PRJEB40798 +ERR4678563,ERR4678563,Latvia,Liepaja,2018-11-07,Munk 2022,PRJEB40798 +ERR4678564,ERR4678564,Israel,Beer-Sheva,2018-11-26,Munk 2022,PRJEB40798 +ERR4678565,ERR4678565,France,Nantes,2018-11-08,Munk 2022,PRJEB40798 +ERR4678566,ERR4678566,France,Saint Philibert,2018-11-26,Munk 2022,PRJEB40798 +ERR4678567,ERR4678567,United Kingdom,Falmouth,2018-11-09,Munk 2022,PRJEB40798 +ERR4678568,ERR4678568,United Kingdom,Camborne,2018-11-09,Munk 2022,PRJEB40798 +ERR4678569,ERR4678569,Guatemala,Guatemala City,2018-11-16,Munk 2022,PRJEB40798 +ERR4678570,ERR4678570,Hungary,Budapest,2018-11-26,Munk 2022,PRJEB40798 
+ERR4678571,ERR4678571,Madagascar,Antananarivo,2018-12-05,Munk 2022,PRJEB40798 +ERR4678572,ERR4678572,Croatia,Zagreb,2018-12-05,Munk 2022,PRJEB40798 +ERR4678573,ERR4678573,Spain,Barcelona,2018-11-29,Munk 2022,PRJEB40798 +ERR4678574,ERR4678574,Spain,Lleida,2018-12-03,Munk 2022,PRJEB40798 +ERR4678575,ERR4678575,Hungary,Szekesfehervar,2018-12-06,Munk 2022,PRJEB40798 +ERR4678576,ERR4678576,Germany,Dresden,2018-12-05,Munk 2022,PRJEB40798 +ERR4678577,ERR4678577,Czech Republic,Prague,2018-12-04,Munk 2022,PRJEB40798 +ERR4678578,ERR4678578,Czech Republic,Brno,2018-12-05,Munk 2022,PRJEB40798 +ERR4678579,ERR4678579,Iceland,Reykjavik,2018-12-05,Munk 2022,PRJEB40798 +ERR4678580,ERR4678580,USA,Columbus,2018-11-11,Munk 2022,PRJEB40798 +ERR4678581,ERR4678581,Malaysia,Klang,2018-12-07,Munk 2022,PRJEB40798 +ERR4678582,ERR4678582,Malaysia,Kuala Lumpur,2018-12-07,Munk 2022,PRJEB40798 +ERR4678583,ERR4678583,Greece,Athens,2018-12-07,Munk 2022,PRJEB40798 +ERR4678584,ERR4678584,Portugal,Lisbon,2018-12-11,Munk 2022,PRJEB40798 +ERR4678585,ERR4678585,Saudi Arabia,Thuwal,2018-11-27,Munk 2022,PRJEB40798 +ERR4678586,ERR4678586,Slovenia,Ljubljana,2018-12-11,Munk 2022,PRJEB40798 +ERR4678587,ERR4678587,Thailand,Pathum Thani,2018-12-07,Munk 2022,PRJEB40798 +ERR4678588,ERR4678588,Brazil,Belo Horizonte,2018-12-08,Munk 2022,PRJEB40798 +ERR4678589,ERR4678589,Japan,Hiroshima,2018-11-21,Munk 2022,PRJEB40798 +ERR4678590,ERR4678590,Japan,Hiroshima,2018-11-21,Munk 2022,PRJEB40798 +ERR4678591,ERR4678591,Japan,Hiroshima,2018-11-21,Munk 2022,PRJEB40798 +ERR4678592,ERR4678592,Japan,Hiroshima,2018-11-21,Munk 2022,PRJEB40798 +ERR4678593,ERR4678593,Malta,St. 
Venera,2018-12-11,Munk 2022,PRJEB40798 +ERR4678594,ERR4678594,Turkey,Hatay,2018-12-04,Munk 2022,PRJEB40798 +ERR4678595,ERR4678595,Brazil,Rio de Janeiro,2018-01-01,Munk 2022,PRJEB40798 +ERR4678596,ERR4678596,India,Udupi,2018-11-28,Munk 2022,PRJEB40798 +ERR4678597,ERR4678597,Tanzania,Morogoro,2018-11-28,Munk 2022,PRJEB40798 +ERR4678598,ERR4678598,Tanzania,Mwanza,2018-11-28,Munk 2022,PRJEB40798 +ERR4678599,ERR4678599,Cote d'Ivoire,Bouake,2018-12-17,Munk 2022,PRJEB40798 +ERR4678600,ERR4678600,South Korea,Daejeon,2018-01-01,Munk 2022,PRJEB40798 +ERR4678601,ERR4678601,Belgium,Ostend,2018-12-09,Munk 2022,PRJEB40798 +ERR4678602,ERR4678602,Belgium,Leuven,2018-12-12,Munk 2022,PRJEB40798 +ERR4678603,ERR4678603,Belgium,Bruges,2018-12-05,Munk 2022,PRJEB40798 +ERR4678604,ERR4678604,Belgium,Deurne,2018-12-05,Munk 2022,PRJEB40798 +ERR4678605,ERR4678605,Belgium,Ghent,2018-12-05,Munk 2022,PRJEB40798 +ERR4678606,ERR4678606,India,Kochi,2018-12-20,Munk 2022,PRJEB40798 +ERR4678607,ERR4678607,USA,"Atlanta, GA",2018-11-29,Munk 2022,PRJEB40798 +ERR4678608,ERR4678608,Uruguay,Montevideo,2018-08-27,Munk 2022,PRJEB40798 +ERR4678609,ERR4678609,Cambodia,Phnom Penh,2019-01-03,Munk 2022,PRJEB40798 +ERR4678610,ERR4678610,Madagascar,Lake Anosy,2018-11-21,Munk 2022,PRJEB40798 +ERR4678611,ERR4678611,Uganda,Kampala,2019-01-10,Munk 2022,PRJEB40798 +ERR4678612,ERR4678612,Madagascar,Canal Andriantany,2018-11-21,Munk 2022,PRJEB40798 +ERR4678613,ERR4678613,Mauritius,Saint Martin,2018-12-18,Munk 2022,PRJEB40798 +ERR4678614,ERR4678614,Mauritius,Grand Baie,2018-12-18,Munk 2022,PRJEB40798 +ERR4678615,ERR4678615,Italy,Novara,2018-12-14,Munk 2022,PRJEB40798 +ERR4678616,ERR4678616,Taiwan,Taipei,2018-12-19,Munk 2022,PRJEB40798 +ERR4678617,ERR4678617,Italy,Verbania,2018-12-13,Munk 2022,PRJEB40798 +ERR4678618,ERR4678618,Italy,Cannobio,2018-12-13,Munk 2022,PRJEB40798 +ERR4678619,ERR4678619,China,Nanjing,2018-12-27,Munk 2022,PRJEB40798 +ERR4678620,ERR4678620,South Korea,Gwangju,2018-12-03,Munk 2022,PRJEB40798 
+ERR4678621,ERR4678621,Botswana,Palapye,2018-12-07,Munk 2022,PRJEB40798 +ERR4678622,ERR4678622,South Africa,Tshwane,2018-12-06,Munk 2022,PRJEB40798 +ERR4678623,ERR4678623,China,Suzhou,2018-12-27,Munk 2022,PRJEB40798 +ERR4678624,ERR4678624,Pakistan,Faisalabad,2018-12-27,Munk 2022,PRJEB40798 +ERR4678625,ERR4678625,Bolivia,La Paz City,2019-01-15,Munk 2022,PRJEB40798 +ERR4678626,ERR4678626,Switzerland,Herrenschwanden,2018-11-22,Munk 2022,PRJEB40798 +ERR4678627,ERR4678627,United Kingdom,Edinburgh,2018-11-30,Munk 2022,PRJEB40798 +ERR4678628,ERR4678628,USA,Chapel Hill,2019-01-16,Munk 2022,PRJEB40798 +ERR4678629,ERR4678629,Bangladesh,Dhaka,2018-12-31,Munk 2022,PRJEB40798 +ERR4678630,ERR4678630,Australia,Darwin,2018-12-18,Munk 2022,PRJEB40798 +ERR4678631,ERR4678631,Australia,Maningrida,2018-12-18,Munk 2022,PRJEB40798 +ERR4678632,ERR4678632,Bosnia and Herzegovina,Banja Luka,2018-12-10,Munk 2022,PRJEB40798 +ERR4678633,ERR4678633,Ghana,Tamale,2019-01-19,Munk 2022,PRJEB40798 +ERR4678634,ERR4678634,Japan,Anonymous,2018-12-19,Munk 2022,PRJEB40798 +ERR4678635,ERR4678635,Germany,Berlin,2018-12-13,Munk 2022,PRJEB40798 +ERR4678636,ERR4678636,Ghana,Accra,2018-11-20,Munk 2022,PRJEB40798 +ERR4678637,ERR4678637,Ghana,Accra,2018-11-19,Munk 2022,PRJEB40798 +ERR4678638,ERR4678638,Ghana,Kasoa,2018-11-19,Munk 2022,PRJEB40798 +ERR4678639,ERR4678639,Chile,Valparaiso,2019-01-24,Munk 2022,PRJEB40798 +ERR4678640,ERR4678640,Chile,Santiago,2019-01-24,Munk 2022,PRJEB40798 +ERR4678641,ERR4678641,Nigeria,Ilorin,2018-11-29,Munk 2022,PRJEB40798 +ERR4678642,ERR4678642,Greece,Thessaloniki,2019-01-29,Munk 2022,PRJEB40798 +ERR4678643,ERR4678643,Barbados,Worthing,2018-12-21,Munk 2022,PRJEB40798 +ERR4678644,ERR4678644,Barbados,Bridgetown,2018-12-21,Munk 2022,PRJEB40798 +ERR4678645,ERR4678645,Nigeria,Ilorin,2018-01-01,Munk 2022,PRJEB40798 +ERR4678646,ERR4678646,Senegal,Dakar,2019-01-17,Munk 2022,PRJEB40798 +ERR4678647,ERR4678647,Morocco,Casablanca,2018-01-01,Munk 2022,PRJEB40798 
+ERR4678648,ERR4678648,Slovakia,Bratislava,2018-12-05,Munk 2022,PRJEB40798 +ERR4678649,ERR4678649,Spain,Santiago,2019-01-08,Munk 2022,PRJEB40798 +ERR4678650,ERR4678650,Portugal,Porto,2019-01-29,Munk 2022,PRJEB40798 +ERR4678651,ERR4678651,Italy,Rome,2018-11-26,Munk 2022,PRJEB40798 +ERR4678652,ERR4678652,Finland,Oulu,2018-11-01,Munk 2022,PRJEB40798 +ERR4678653,ERR4678653,Finland,Oulu,2018-11-01,Munk 2022,PRJEB40798 +ERR4678654,ERR4678654,Finland,Turku,2018-11-14,Munk 2022,PRJEB40798 +ERR4678655,ERR4678655,Finland,Helsinki,2018-11-13,Munk 2022,PRJEB40798 +ERR4678656,ERR4678656,Finland,Helsinki,2018-11-13,Munk 2022,PRJEB40798 +ERR4678657,ERR4678657,Canada,Calgary,2018-12-10,Munk 2022,PRJEB40798 +ERR4678658,ERR4678658,Canada,Regina,2018-12-19,Munk 2022,PRJEB40798 +ERR4678659,ERR4678659,Canada,Vancouver,2018-12-06,Munk 2022,PRJEB40798 +ERR4678660,ERR4678660,Canada,Ottawa,2018-12-06,Munk 2022,PRJEB40798 +ERR4678661,ERR4678661,Mozambique,Maputo,2018-12-13,Munk 2022,PRJEB40798 +ERR4678662,ERR4678662,Nepal,Kathmandu,2019-02-13,Munk 2022,PRJEB40798 +ERR4678663,ERR4678663,Nigeria,Lagos,2018-12-18,Munk 2022,PRJEB40798 +ERR4678664,ERR4678664,Argentina,Buenos Aires,2018-12-08,Munk 2022,PRJEB40798 +ERR4678665,ERR4678665,Montenegro,Podgorica,2019-02-08,Munk 2022,PRJEB40798 +ERR4678666,ERR4678666,Singapore,Jurong,2019-01-16,Munk 2022,PRJEB40798 +ERR4678667,ERR4678667,Cameroon,Yaounde,2018-11-23,Munk 2022,PRJEB40798 +ERR4678668,ERR4678668,Serbia,Belgrade,2019-01-14,Munk 2022,PRJEB40798 +ERR4678669,ERR4678669,Serbia,Belgrade,2019-01-14,Munk 2022,PRJEB40798 +ERR4678670,ERR4678670,New Zealand,Christchurch,2018-12-17,Munk 2022,PRJEB40798 +ERR4678671,ERR4678671,Denmark,Avedore,2018-11-18,Munk 2022,PRJEB40798 +ERR4678672,ERR4678672,Denmark,Valby,2018-11-18,Munk 2022,PRJEB40798 +ERR4678673,ERR4678673,Denmark,Amager,2018-11-18,Munk 2022,PRJEB40798 +ERR4678674,ERR4678674,Denmark,Amager,2018-11-18,Munk 2022,PRJEB40798 +ERR4678675,ERR4678675,Greenland,Sisimiut,2018-11-08,Munk 2022,PRJEB40798 
+ERR4678676,ERR4678676,Netherlands,Utrecht,2018-12-20,Munk 2022,PRJEB40798 +ERR4678677,ERR4678677,Malaysia,Butterworth,2018-12-13,Munk 2022,PRJEB40798 +ERR4678678,ERR4678678,Malaysia,Jelutong,2018-12-13,Munk 2022,PRJEB40798 +ERR4678679,ERR4678679,Malaysia,Sungai Petani,2018-12-14,Munk 2022,PRJEB40798 +ERR4678680,ERR4678680,Togo,Lome,2018-11-25,Munk 2022,PRJEB40798 +ERR4678681,ERR4678681,United Arab Emirates,Dubai,2018-12-10,Munk 2022,PRJEB40798 +ERR4678682,ERR4678682,Poland,Pulawy,2018-12-13,Munk 2022,PRJEB40798 +ERR4678683,ERR4678683,Poland,Pulawy,2018-12-13,Munk 2022,PRJEB40798 +ERR4678684,ERR4678684,Colombia,Mosquera,2019-04-25,Munk 2022,PRJEB40798 +ERR4678685,ERR4678685,Democratic Republic of the Congo,Bukavu,2018-11-17,Munk 2022,PRJEB40798 +ERR4678686,ERR4678686,Democratic Republic of the Congo,Bukavu,2018-11-17,Munk 2022,PRJEB40798 +ERR4678687,ERR4678687,Democratic Republic of the Congo,Bukavu,2018-11-17,Munk 2022,PRJEB40798 +ERR4678688,ERR4678688,Democratic Republic of the Congo,Bukavu,2018-11-17,Munk 2022,PRJEB40798 +ERR4678689,ERR4678689,Democratic Republic of the Congo,Bukavu,2018-12-18,Munk 2022,PRJEB40798 +ERR4678690,ERR4678690,Democratic Republic of the Congo,Bukavu,2018-12-18,Munk 2022,PRJEB40798 +ERR4678691,ERR4678691,Democratic Republic of the Congo,Bukavu,2018-12-18,Munk 2022,PRJEB40798 +ERR4678692,ERR4678692,Democratic Republic of the Congo,Bukavu,2018-12-18,Munk 2022,PRJEB40798 +ERR4678693,ERR4678693,Pakistan,Hyderabad,2017-06-15,Munk 2022,PRJEB40798 +ERR4678694,ERR4678694,USA,Chapel Hill,2018-01-22,Munk 2022,PRJEB40798 +ERR4678695,ERR4678695,USA,Houston,2018-01-19,Munk 2022,PRJEB40798 +ERR4678696,ERR4678696,Nigeria,Ibadan,2017-12-27,Munk 2022,PRJEB40798 +ERR4678697,ERR4678697,Nigeria,Ibadan,2017-12-27,Munk 2022,PRJEB40798 +ERR4678698,ERR4678698,Cote d'Ivoire,Bouake,2018-12-17,Munk 2022,PRJEB40798 +ERR4678699,ERR4678699,Denmark,Lyngby,2019-02-22,Munk 2022,PRJEB40798 +ERR4678700,ERR4678700,Denmark,Lyngby,2019-03-08,Munk 2022,PRJEB40798 
+ERR4678701,ERR4678701,Denmark,Lyngby,2019-03-13,Munk 2022,PRJEB40798 +ERR4678702,ERR4678702,Denmark,Lyngby,2019-03-15,Munk 2022,PRJEB40798 +ERR4678703,ERR4678703,Denmark,Lyngby,2019-03-19,Munk 2022,PRJEB40798 +ERR4678704,ERR4678704,Denmark,Lyngby,2019-03-20,Munk 2022,PRJEB40798 +ERR4678705,ERR4678705,Denmark,Lyngby,2019-04-12,Munk 2022,PRJEB40798 +ERR4678706,ERR4678706,Denmark,Lyngby,2019-05-10,Munk 2022,PRJEB40798 +ERR4678707,ERR4678707,Denmark,Lyngby,2019-05-14,Munk 2022,PRJEB40798 diff --git a/data/2024-05-06_munk/Munk-PRJEB40798/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40798/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..8faeea8 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40798/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB40815/adapters.fasta new file mode 100644 index 0000000..694323e --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40815/adapters.fasta @@ -0,0 +1,73 @@ +>0 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>1 +TTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGC +>2 +unspecified +>3 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>4 +AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCT +>5 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>6 +ATACATACATACATACATACATACATACATACATACATACATACATACATACATACATAC +>7 +heifigepsna +>8 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>9 +AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA +>10 +ATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGT +>11 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>12 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>13 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>14 +GTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTAT +>15 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>16 +CAAGCAGAAGACGGCATACGAGAT +>17 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>18 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>19 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>20 
+TACATACATACATACATACATACATACATACATACATACATACATACATACATACATACA +>21 +ACATACATACATACATACATACATACATACATACATACATACATACATACATACATACAT +>22 +AGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTT +>23 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>24 +GGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTA +>25 +TGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTA +>26 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>27 +TATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATG +>28 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>29 +TAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC +>30 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>31 +TAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCT +>32 +CATACATACATACATACATACATACATACATACATACATACATACATACATACATACATA +>33 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>34 +GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA +>35 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..4a192d2 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..965830f Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..5b7fa3e Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..eb0731c Binary files 
/dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..c5925b4 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..12b075e Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..9667b0f Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB40815/sample-metadata.csv new file mode 100644 index 0000000..1ddb558 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40815/sample-metadata.csv @@ -0,0 +1,149 @@ +library,sample,country,city,date,dataset,bioproject +ERR4682329,ERR4682329,Thailand,Surat Thani,2017-11-06,Munk 2022,PRJEB40815 +ERR4682330,ERR4682330,Italy,Novara,2017-11-20,Munk 2022,PRJEB40815 +ERR4682331,ERR4682331,Italy,Verbania,2017-11-20,Munk 2022,PRJEB40815 +ERR4682332,ERR4682332,Italy,Cannobio,2017-11-20,Munk 2022,PRJEB40815 +ERR4682333,ERR4682333,Sweden,Gothenburg,2017-11-19,Munk 2022,PRJEB40815 +ERR4682334,ERR4682334,Austria,Vienna,2017-11-21,Munk 2022,PRJEB40815 +ERR4682335,ERR4682335,Portugal,Lisbon,2017-11-22,Munk 2022,PRJEB40815 +ERR4682336,ERR4682336,Cambodia,Phnom Penh,2017-11-20,Munk 2022,PRJEB40815 +ERR4682337,ERR4682337,Croatia,Zagreb,2017-11-13,Munk 2022,PRJEB40815 +ERR4682338,ERR4682338,Greece,Athens,2017-11-21,Munk 2022,PRJEB40815 
+ERR4682339,ERR4682339,Nicaragua,Leon,2017-11-21,Munk 2022,PRJEB40815 +ERR4682340,ERR4682340,Switzerland,Bern,2017-11-21,Munk 2022,PRJEB40815 +ERR4682341,ERR4682341,Australia,Darwin,2017-11-21,Munk 2022,PRJEB40815 +ERR4682342,ERR4682342,Australia,Maningrida,2017-11-22,Munk 2022,PRJEB40815 +ERR4682343,ERR4682343,Belgium,Ghent,2017-11-22,Munk 2022,PRJEB40815 +ERR4682344,ERR4682344,Belgium,Bruges,2017-11-22,Munk 2022,PRJEB40815 +ERR4682345,ERR4682345,Belgium,Antwerp South,2017-11-21,Munk 2022,PRJEB40815 +ERR4682346,ERR4682346,Belgium,Deurne,2017-11-22,Munk 2022,PRJEB40815 +ERR4682347,ERR4682347,France,Nantes,2017-11-20,Munk 2022,PRJEB40815 +ERR4682348,ERR4682348,France,Saint Philibert,2017-11-22,Munk 2022,PRJEB40815 +ERR4682349,ERR4682349,Montenegro,Podgorica,2017-11-23,Munk 2022,PRJEB40815 +ERR4682350,ERR4682350,Czech Republic,Prague,2017-11-20,Munk 2022,PRJEB40815 +ERR4682351,ERR4682351,Czech Republic,Brno,2017-11-22,Munk 2022,PRJEB40815 +ERR4682352,ERR4682352,Guatemala,San Miguel Petapa,2017-11-20,Munk 2022,PRJEB40815 +ERR4682353,ERR4682353,South Africa,Pretoria,2017-11-17,Munk 2022,PRJEB40815 +ERR4682354,ERR4682354,Bosnia and Herzegovina,Banja Luka,2017-11-21,Munk 2022,PRJEB40815 +ERR4682355,ERR4682355,Tanzania,Mwanza,2017-11-20,Munk 2022,PRJEB40815 +ERR4682356,ERR4682356,Tanzania,Morogoro,2017-11-21,Munk 2022,PRJEB40815 +ERR4682357,ERR4682357,Benin,Cotonou,2017-11-11,Munk 2022,PRJEB40815 +ERR4682358,ERR4682358,South Korea,Daejeon,2017-11-23,Munk 2022,PRJEB40815 +ERR4682359,ERR4682359,Cote d'Ivoire,Abidjan,2017-11-22,Munk 2022,PRJEB40815 +ERR4682360,ERR4682360,Cote d'Ivoire,Bouake,2017-11-21,Munk 2022,PRJEB40815 +ERR4682361,ERR4682361,Cote d'Ivoire,Yamoussoukro,2017-11-22,Munk 2022,PRJEB40815 +ERR4682362,ERR4682362,Saudi Arabia,Thuwal,2017-11-22,Munk 2022,PRJEB40815 +ERR4682363,ERR4682363,Slovenia,Ljubljana,2017-11-30,Munk 2022,PRJEB40815 +ERR4682364,ERR4682364,Spain,Barcelona,2017-11-30,Munk 2022,PRJEB40815 
+ERR4682365,ERR4682365,Slovakia,Bratislava,2017-11-21,Munk 2022,PRJEB40815 +ERR4682366,ERR4682366,Greece,Thessaloniki,2017-11-30,Munk 2022,PRJEB40815 +ERR4682367,ERR4682367,Hong Kong,Hong Kong,2017-11-30,Munk 2022,PRJEB40815 +ERR4682368,ERR4682368,Uganda,Kampala,2017-12-01,Munk 2022,PRJEB40815 +ERR4682369,ERR4682369,Iceland,Reykjavik,2017-11-24,Munk 2022,PRJEB40815 +ERR4682370,ERR4682370,Norway,Oslo,2017-11-23,Munk 2022,PRJEB40815 +ERR4682371,ERR4682371,Malaysia,Kuala Lumpur,2017-11-30,Munk 2022,PRJEB40815 +ERR4682372,ERR4682372,Malaysia,Klang,2017-11-30,Munk 2022,PRJEB40815 +ERR4682373,ERR4682373,Malaysia,Ipoh,2017-11-29,Munk 2022,PRJEB40815 +ERR4682374,ERR4682374,Albania,Tirana,2017-11-20,Munk 2022,PRJEB40815 +ERR4682375,ERR4682375,Germany,Berlin,2017-11-28,Munk 2022,PRJEB40815 +ERR4682376,ERR4682376,Hungary,Budapest,2017-11-30,Munk 2022,PRJEB40815 +ERR4682377,ERR4682377,Germany,Dresden,2017-11-29,Munk 2022,PRJEB40815 +ERR4682378,ERR4682378,Chad,N'Djamena,2017-11-26,Munk 2022,PRJEB40815 +ERR4682379,ERR4682379,Greenland,Sisimiut,2017-11-30,Munk 2022,PRJEB40815 +ERR4682380,ERR4682380,Taiwan,Taipei,2017-11-22,Munk 2022,PRJEB40815 +ERR4682381,ERR4682381,India,Kochi,2017-11-24,Munk 2022,PRJEB40815 +ERR4682382,ERR4682382,United Kingdom,Camborne,2017-11-28,Munk 2022,PRJEB40815 +ERR4682383,ERR4682383,United Kingdom,Falmouth,2017-01-01,Munk 2022,PRJEB40815 +ERR4682384,ERR4682384,Japan,Anonymous,2017-11-27,Munk 2022,PRJEB40815 +ERR4682385,ERR4682385,India,Manipal,2017-11-23,Munk 2022,PRJEB40815 +ERR4682386,ERR4682386,Thailand,Pathum Thani,2017-11-23,Munk 2022,PRJEB40815 +ERR4682387,ERR4682387,Barbados,Bridgetown,2017-11-23,Munk 2022,PRJEB40815 +ERR4682388,ERR4682388,Barbados,Worthing,2017-11-23,Munk 2022,PRJEB40815 +ERR4682389,ERR4682389,Paraguay,San Lorenzo,2017-11-30,Munk 2022,PRJEB40815 +ERR4682390,ERR4682390,Pakistan,Karachi,2017-11-27,Munk 2022,PRJEB40815 +ERR4682391,ERR4682391,Ireland,Galway,2017-11-27,Munk 2022,PRJEB40815 
+ERR4682392,ERR4682392,Hungary,Szekesfehervar,2017-11-27,Munk 2022,PRJEB40815 +ERR4682393,ERR4682393,Canada,Quebec,2017-12-05,Munk 2022,PRJEB40815 +ERR4682394,ERR4682394,Italy,Rome,2017-11-20,Munk 2022,PRJEB40815 +ERR4682395,ERR4682395,Poland,Pulawy,2017-11-28,Munk 2022,PRJEB40815 +ERR4682396,ERR4682396,Nigeria,Lagos,2017-12-06,Munk 2022,PRJEB40815 +ERR4682397,ERR4682397,Mauritius,Port Louis,2017-12-01,Munk 2022,PRJEB40815 +ERR4682398,ERR4682398,Belgium,Brussels,2017-11-23,Munk 2022,PRJEB40815 +ERR4682399,ERR4682399,Estonia,Tallinn,2017-12-09,Munk 2022,PRJEB40815 +ERR4682400,ERR4682400,Estonia,Tartu,2017-11-20,Munk 2022,PRJEB40815 +ERR4682401,ERR4682401,Estonia,Parnu,2017-11-28,Munk 2022,PRJEB40815 +ERR4682402,ERR4682402,Estonia,Narva,2017-11-22,Munk 2022,PRJEB40815 +ERR4682403,ERR4682403,Sweden,Uppsala,2017-11-27,Munk 2022,PRJEB40815 +ERR4682404,ERR4682404,Iran,Shiraz,2017-11-30,Munk 2022,PRJEB40815 +ERR4682405,ERR4682405,Luxembourg,Luxembourg,2017-12-01,Munk 2022,PRJEB40815 +ERR4682406,ERR4682406,Portugal,Porto,2017-11-22,Munk 2022,PRJEB40815 +ERR4682407,ERR4682407,South Korea,Gwangju,2017-11-20,Munk 2022,PRJEB40815 +ERR4682408,ERR4682408,Burkina Faso,Ouagadougou,2018-01-04,Munk 2022,PRJEB40815 +ERR4682409,ERR4682409,Burkina Faso,Bobo Dioulasso,2017-12-19,Munk 2022,PRJEB40815 +ERR4682410,ERR4682410,Colombia,Mosquera,2017-12-28,Munk 2022,PRJEB40815 +ERR4682411,ERR4682411,Botswana,Gaborone,2017-12-01,Munk 2022,PRJEB40815 +ERR4682412,ERR4682412,Malawi,Zomba,2017-11-22,Munk 2022,PRJEB40815 +ERR4682413,ERR4682413,Malawi,Chikwawa,2017-11-21,Munk 2022,PRJEB40815 +ERR4682414,ERR4682414,Malawi,Machinga,2017-11-22,Munk 2022,PRJEB40815 +ERR4682415,ERR4682415,Malawi,Mchinji,2017-11-23,Munk 2022,PRJEB40815 +ERR4682416,ERR4682416,Togo,Lome,2017-11-27,Munk 2022,PRJEB40815 +ERR4682417,ERR4682417,Malawi,Lilongwe,2017-11-23,Munk 2022,PRJEB40815 +ERR4682418,ERR4682418,Malawi,Karonga,2017-11-24,Munk 2022,PRJEB40815 +ERR4682419,ERR4682419,Malawi,Nsanje,2017-11-21,Munk 2022,PRJEB40815 
+ERR4682420,ERR4682420,Bangladesh,Dhaka,2017-11-25,Munk 2022,PRJEB40815 +ERR4682421,ERR4682421,Kuwait,Kuwait,2017-12-07,Munk 2022,PRJEB40815 +ERR4682422,ERR4682422,Nepal,Kathmandu,2018-01-03,Munk 2022,PRJEB40815 +ERR4682423,ERR4682423,Nepal,Bhaktapur,2018-01-03,Munk 2022,PRJEB40815 +ERR4682424,ERR4682424,USA,Chapel Hill,2017-11-02,Munk 2022,PRJEB40815 +ERR4682425,ERR4682425,Ghana,Tamale,2017-12-09,Munk 2022,PRJEB40815 +ERR4682426,ERR4682426,Cameroon,Yaounde,2017-11-17,Munk 2022,PRJEB40815 +ERR4682427,ERR4682427,United Kingdom,Edinburgh,2017-11-21,Munk 2022,PRJEB40815 +ERR4682428,ERR4682428,Uruguay,Montevideo,2017-12-01,Munk 2022,PRJEB40815 +ERR4682429,ERR4682429,Chile,Santiago,2017-12-04,Munk 2022,PRJEB40815 +ERR4682430,ERR4682430,Chile,Punta Arenas,2017-11-29,Munk 2022,PRJEB40815 +ERR4682431,ERR4682431,Morocco,Casablanca,2017-11-29,Munk 2022,PRJEB40815 +ERR4682432,ERR4682432,Israel,Beer-Sheva,2017-12-06,Munk 2022,PRJEB40815 +ERR4682433,ERR4682433,Canada,Ottawa,2017-11-27,Munk 2022,PRJEB40815 +ERR4682434,ERR4682434,Canada,Regina,2017-11-22,Munk 2022,PRJEB40815 +ERR4682435,ERR4682435,Canada,Calgary,2017-11-21,Munk 2022,PRJEB40815 +ERR4682436,ERR4682436,Canada,Vancouver,2017-11-19,Munk 2022,PRJEB40815 +ERR4682437,ERR4682437,Ghana,Kumasi,2017-11-29,Munk 2022,PRJEB40815 +ERR4682438,ERR4682438,United Kingdom,Newcastle Upon Tyne,2017-12-04,Munk 2022,PRJEB40815 +ERR4682439,ERR4682439,Finland,Oulu,2017-11-21,Munk 2022,PRJEB40815 +ERR4682440,ERR4682440,Finland,Turku,2017-11-22,Munk 2022,PRJEB40815 +ERR4682441,ERR4682441,Finland,Helsinki,2017-11-22,Munk 2022,PRJEB40815 +ERR4682442,ERR4682442,USA,"Seattle, WA",2017-11-28,Munk 2022,PRJEB40815 +ERR4682443,ERR4682443,United Arab Emirates,Dubai,2017-11-28,Munk 2022,PRJEB40815 +ERR4682444,ERR4682444,New Zealand,Christchurch,2017-11-21,Munk 2022,PRJEB40815 +ERR4682445,ERR4682445,USA,Houston,2018-01-19,Munk 2022,PRJEB40815 +ERR4682446,ERR4682446,USA,Austin,2018-01-18,Munk 2022,PRJEB40815 
+ERR4682447,ERR4682447,USA,Waco,2018-01-11,Munk 2022,PRJEB40815 +ERR4682448,ERR4682448,Malaysia,Alor Setar,2017-11-23,Munk 2022,PRJEB40815 +ERR4682449,ERR4682449,Malaysia,Jelutong,2017-11-23,Munk 2022,PRJEB40815 +ERR4682450,ERR4682450,Malaysia,Butterworth,2017-11-23,Munk 2022,PRJEB40815 +ERR4682451,ERR4682451,Tanzania,Moshi,2017-11-29,Munk 2022,PRJEB40815 +ERR4682452,ERR4682452,Philippines,Quezon City,2017-12-21,Munk 2022,PRJEB40815 +ERR4682453,ERR4682453,Argentina,Buenos Aires,2017-11-22,Munk 2022,PRJEB40815 +ERR4682454,ERR4682454,Saint Lucia,Castries,2017-01-01,Munk 2022,PRJEB40815 +ERR4682455,ERR4682455,Saint Lucia,Gros Islet,2017-01-01,Munk 2022,PRJEB40815 +ERR4682456,ERR4682456,Ethiopia,Addis Ababa,2017-11-24,Munk 2022,PRJEB40815 +ERR4682457,ERR4682457,Nigeria,Ilorin,2018-01-10,Munk 2022,PRJEB40815 +ERR4682458,ERR4682458,Ecuador,Quito,2017-11-23,Munk 2022,PRJEB40815 +ERR4682459,ERR4682459,Mozambique,Maputo,2017-11-17,Munk 2022,PRJEB40815 +ERR4682460,ERR4682460,Iran,Tehran,2018-10-01,Munk 2022,PRJEB40815 +ERR4682461,ERR4682461,Senegal,Dakar,2018-03-08,Munk 2022,PRJEB40815 +ERR4682462,ERR4682462,Denmark,Valby,2017-11-18,Munk 2022,PRJEB40815 +ERR4682463,ERR4682463,Denmark,Amager,2017-11-18,Munk 2022,PRJEB40815 +ERR4682464,ERR4682464,Denmark,Avedore,2017-11-18,Munk 2022,PRJEB40815 +ERR4682465,ERR4682465,China,Guangzhou,2017-12-21,Munk 2022,PRJEB40815 +ERR4682466,ERR4682466,Viet Nam,Ho Chi Minh,2017-12-27,Munk 2022,PRJEB40815 +ERR4682467,ERR4682467,Serbia,Belgrade,2017-01-01,Munk 2022,PRJEB40815 +ERR4682468,ERR4682468,Netherlands,Utrecht,2017-11-23,Munk 2022,PRJEB40815 +ERR4682469,ERR4682469,Denmark,Lyngby,2017-12-12,Munk 2022,PRJEB40815 +ERR4682470,ERR4682470,Denmark,Lyngby,2017-12-15,Munk 2022,PRJEB40815 +ERR4682471,ERR4682471,Denmark,Lyngby,2017-12-19,Munk 2022,PRJEB40815 +ERR4682472,ERR4682472,Denmark,Lyngby,2018-01-17,Munk 2022,PRJEB40815 +ERR4682473,ERR4682473,Denmark,Lyngby,2018-01-24,Munk 2022,PRJEB40815 +ERR4682474,ERR4682474,Denmark,Lyngby,2018-02-28,Munk 
2022,PRJEB40815 +ERR4682475,ERR4682475,Denmark,Lyngby,2018-04-05,Munk 2022,PRJEB40815 +ERR4682476,ERR4682476,Denmark,Lyngby,2018-04-24,Munk 2022,PRJEB40815 \ No newline at end of file diff --git a/data/2024-05-06_munk/Munk-PRJEB40815/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40815/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..42bf0e9 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40815/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB40816/adapters.fasta new file mode 100644 index 0000000..d2cea72 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40816/adapters.fasta @@ -0,0 +1,49 @@ +>0 +TGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTATGTA +>1 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>2 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>3 +CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAG +>4 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>5 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>6 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>7 +unspecified +>8 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>9 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>10 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>11 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>12 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>13 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>14 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>15 +CAAGCAGAAGACGGCATACGAGAT +>16 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>17 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>18 +CTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGGCTTAGG +>19 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>20 +heifigepsna +>21 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>22 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>23 +TACATACATACATACATACATACATACATACATACATACATACATACATACATACATACA diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..0370fdc Binary files /dev/null 
and b/data/2024-05-06_munk/Munk-PRJEB40816/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..0a1d204 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..374e507 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..56d4d43 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..057b38a Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..e651be3 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..6da00f0 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/qc_quality_sequence_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB40816/sample-metadata.csv new file mode 100644 index 0000000..75298f7 --- 
/dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB40816/sample-metadata.csv @@ -0,0 +1,143 @@ +library,sample,country,city,date,dataset,bioproject +ERR4682772,ERR4682772,Singapore,Jurong,2017-11-22,Munk 2022,PRJEB40816 +ERR4682773,ERR4682773,Denmark,Avedore,2018-06-11,Munk 2022,PRJEB40816 +ERR4682774,ERR4682774,Denmark,Valby,2018-06-11,Munk 2022,PRJEB40816 +ERR4682775,ERR4682775,Denmark,Amager,2018-06-11,Munk 2022,PRJEB40816 +ERR4682776,ERR4682776,Sweden,Gothenburg,2018-06-15,Munk 2022,PRJEB40816 +ERR4682777,ERR4682777,Sweden,Uppsala,2018-06-18,Munk 2022,PRJEB40816 +ERR4682778,ERR4682778,Greece,Athens,2018-06-20,Munk 2022,PRJEB40816 +ERR4682779,ERR4682779,Cambodia,Phnom Penh,2018-06-22,Munk 2022,PRJEB40816 +ERR4682780,ERR4682780,Spain,Barcelona,2018-06-21,Munk 2022,PRJEB40816 +ERR4682781,ERR4682781,Belgium,Ostend,2018-06-12,Munk 2022,PRJEB40816 +ERR4682782,ERR4682782,Belgium,Leuven,2018-06-20,Munk 2022,PRJEB40816 +ERR4682783,ERR4682783,Belgium,Harelbeke,2018-06-12,Munk 2022,PRJEB40816 +ERR4682784,ERR4682784,Czech Republic,Prague,2018-06-20,Munk 2022,PRJEB40816 +ERR4682785,ERR4682785,Czech Republic,Brno,2018-06-20,Munk 2022,PRJEB40816 +ERR4682786,ERR4682786,Germany,Dresden,2018-06-21,Munk 2022,PRJEB40816 +ERR4682787,ERR4682787,Malaysia,Kuala Lumpur,2018-05-31,Munk 2022,PRJEB40816 +ERR4682788,ERR4682788,France,Nantes,2018-06-11,Munk 2022,PRJEB40816 +ERR4682789,ERR4682789,France,Saint Philibert,2018-06-20,Munk 2022,PRJEB40816 +ERR4682790,ERR4682790,Saudi Arabia,Thuwal,2018-06-12,Munk 2022,PRJEB40816 +ERR4682791,ERR4682791,Austria,Vienna,2018-06-21,Munk 2022,PRJEB40816 +ERR4682792,ERR4682792,Bosnia and Herzegovina,Banja Luka,2018-06-27,Munk 2022,PRJEB40816 +ERR4682793,ERR4682793,Burkina Faso,Ouagadougou,2018-06-22,Munk 2022,PRJEB40816 +ERR4682794,ERR4682794,Portugal,Lisbon,2018-06-27,Munk 2022,PRJEB40816 +ERR4682795,ERR4682795,Togo,Lome,2018-06-23,Munk 2022,PRJEB40816 +ERR4682796,ERR4682796,Hungary,Budapest,2018-06-25,Munk 2022,PRJEB40816 
+ERR4682797,ERR4682797,Italy,Novara,2018-06-29,Munk 2022,PRJEB40816 +ERR4682798,ERR4682798,Italy,Verbania,2018-06-28,Munk 2022,PRJEB40816 +ERR4682799,ERR4682799,Italy,Cannobio,2018-06-28,Munk 2022,PRJEB40816 +ERR4682800,ERR4682800,Slovenia,Ljubljana,2018-06-20,Munk 2022,PRJEB40816 +ERR4682801,ERR4682801,Turkey,Hatay,2018-06-29,Munk 2022,PRJEB40816 +ERR4682802,ERR4682802,Croatia,Zagreb,2018-07-02,Munk 2022,PRJEB40816 +ERR4682803,ERR4682803,South Korea,Daejeon,2018-06,Munk 2022,PRJEB40816 +ERR4682804,ERR4682804,Tanzania,Morogoro,2018-06-25,Munk 2022,PRJEB40816 +ERR4682805,ERR4682805,Tanzania,Mwanza,2018-06-20,Munk 2022,PRJEB40816 +ERR4682806,ERR4682806,Japan,Japan,2018-06-22,Munk 2022,PRJEB40816 +ERR4682807,ERR4682807,United Kingdom,Falmouth,2018-06-29,Munk 2022,PRJEB40816 +ERR4682808,ERR4682808,United Kingdom,Camborne,2018-06-21,Munk 2022,PRJEB40816 +ERR4682809,ERR4682809,Canada,Quebec,2018-06,Munk 2022,PRJEB40816 +ERR4682810,ERR4682810,Pakistan,Faisalabad,2018-06-21,Munk 2022,PRJEB40816 +ERR4682811,ERR4682811,Serbia,Belgrade,2018-06-12,Munk 2022,PRJEB40816 +ERR4682812,ERR4682812,Botswana,Gaborone,2018-07-31,Munk 2022,PRJEB40816 +ERR4682813,ERR4682813,Bulgaria,Sofia,2018-06-07,Munk 2022,PRJEB40816 +ERR4682814,ERR4682814,Netherlands,Amsterdam,2018-07-04,Munk 2022,PRJEB40816 +ERR4682815,ERR4682815,Montenegro,Podgorica,2018-07-09,Munk 2022,PRJEB40816 +ERR4682816,ERR4682816,South Korea,Gwangju,2018-07-10,Munk 2022,PRJEB40816 +ERR4682817,ERR4682817,Mauritius,Saint Martin,2018-07-06,Munk 2022,PRJEB40816 +ERR4682818,ERR4682818,Mauritius,Grand Baie,2018-07-06,Munk 2022,PRJEB40816 +ERR4682819,ERR4682819,Greece,Thessaloniki,2018-07-13,Munk 2022,PRJEB40816 +ERR4682820,ERR4682820,Portugal,Porto,2018-07-02,Munk 2022,PRJEB40816 +ERR4682821,ERR4682821,India,Kochi,2018-07-04,Munk 2022,PRJEB40816 +ERR4682822,ERR4682822,Slovakia,Bratislava,2018-07-03,Munk 2022,PRJEB40816 +ERR4682823,ERR4682823,Poland,Pulawy,2018-07-09,Munk 2022,PRJEB40816 
+ERR4682824,ERR4682824,Netherlands,Utrecht,2018-07-02,Munk 2022,PRJEB40816 +ERR4682825,ERR4682825,Taiwan,Taipei,2018-07-09,Munk 2022,PRJEB40816 +ERR4682826,ERR4682826,Italy,Rome,2018-06-26,Munk 2022,PRJEB40816 +ERR4682827,ERR4682827,China,Nanjing,2018-06-21,Munk 2022,PRJEB40816 +ERR4682828,ERR4682828,China,Yangzhou,2018-06-22,Munk 2022,PRJEB40816 +ERR4682829,ERR4682829,Switzerland,Bern,2018-06-28,Munk 2022,PRJEB40816 +ERR4682830,ERR4682830,Canada,Calgary,2018-06-26,Munk 2022,PRJEB40816 +ERR4682831,ERR4682831,Canada,Regina,2018-07-10,Munk 2022,PRJEB40816 +ERR4682832,ERR4682832,Kuwait,Kuwait,2018-07-07,Munk 2022,PRJEB40816 +ERR4682833,ERR4682833,Morocco,Casablanca,2018-06-29,Munk 2022,PRJEB40816 +ERR4682834,ERR4682834,Brazil,Belo Horizonte,2018-07-09,Munk 2022,PRJEB40816 +ERR4682835,ERR4682835,Barbados,Worthing,2018-07-05,Munk 2022,PRJEB40816 +ERR4682836,ERR4682836,Barbados,Bridgetown,2018-07-05,Munk 2022,PRJEB40816 +ERR4682837,ERR4682837,India,Udupi,2018-06-18,Munk 2022,PRJEB40816 +ERR4682838,ERR4682838,Ecuador,Quito,2018-06-28,Munk 2022,PRJEB40816 +ERR4682839,ERR4682839,Cote d'Ivoire,Abidjan,2018-07-07,Munk 2022,PRJEB40816 +ERR4682840,ERR4682840,Cote d'Ivoire,Yamoussoukro,2018-07-07,Munk 2022,PRJEB40816 +ERR4682841,ERR4682841,Cote d'Ivoire,Bouake,2018-07-06,Munk 2022,PRJEB40816 +ERR4682842,ERR4682842,Canada,Ottawa,2018-07-09,Munk 2022,PRJEB40816 +ERR4682843,ERR4682843,Canada,Vancouver,2018-07-13,Munk 2022,PRJEB40816 +ERR4682844,ERR4682844,Argentina,Buenos Aires,2018-07-10,Munk 2022,PRJEB40816 +ERR4682845,ERR4682845,Uganda,Kampala,2018-07-18,Munk 2022,PRJEB40816 +ERR4682846,ERR4682846,Nigeria,Lagos,2018-07-03,Munk 2022,PRJEB40816 +ERR4682847,ERR4682847,Malaysia,Butterworth,2018-06-27,Munk 2022,PRJEB40816 +ERR4682848,ERR4682848,Malaysia,Jelutong,2018-06-27,Munk 2022,PRJEB40816 +ERR4682849,ERR4682849,Malaysia,Sungai Petani,2018-07-06,Munk 2022,PRJEB40816 +ERR4682850,ERR4682850,Malaysia,Kuala Lumpur,2018-06-29,Munk 2022,PRJEB40816 
+ERR4682851,ERR4682851,Malaysia,Ipoh,2018-06-28,Munk 2022,PRJEB40816 +ERR4682852,ERR4682852,Malaysia,Klang,2018-06-29,Munk 2022,PRJEB40816 +ERR4682853,ERR4682853,New Zealand,Christchurch,2018-07-04,Munk 2022,PRJEB40816 +ERR4682854,ERR4682854,Latvia,Riga,2018-06-20,Munk 2022,PRJEB40816 +ERR4682855,ERR4682855,Latvia,Liepaja,2018-06-21,Munk 2022,PRJEB40816 +ERR4682856,ERR4682856,United Kingdom,Edinburgh,2018-06-27,Munk 2022,PRJEB40816 +ERR4682857,ERR4682857,Cote d'Ivoire,Yamoussoukro,2018-07-07,Munk 2022,PRJEB40816 +ERR4682858,ERR4682858,Cote d'Ivoire,Bouake,2018-07-06,Munk 2022,PRJEB40816 +ERR4682859,ERR4682859,Chile,Santiago,2018-08-14,Munk 2022,PRJEB40816 +ERR4682860,ERR4682860,Chile,Santiago,2018-08-14,Munk 2022,PRJEB40816 +ERR4682861,ERR4682861,Finland,Turku,2018-06-27,Munk 2022,PRJEB40816 +ERR4682862,ERR4682862,Finland,Helsinki,2018-06-20,Munk 2022,PRJEB40816 +ERR4682863,ERR4682863,Finland,Oulu,2018-06-20,Munk 2022,PRJEB40816 +ERR4682864,ERR4682864,Norway,Oslo,2018-07-18,Munk 2022,PRJEB40816 +ERR4682865,ERR4682865,Greenland,Sisimiut,2018-06-26,Munk 2022,PRJEB40816 +ERR4682866,ERR4682866,Israel,Beer-Sheva,2018-07-07,Munk 2022,PRJEB40816 +ERR4682867,ERR4682867,Benin,Djeffa,2018-08-22,Munk 2022,PRJEB40816 +ERR4682868,ERR4682868,United Kingdom,Newcastle Upon Tyne,2018-06-28,Munk 2022,PRJEB40816 +ERR4682869,ERR4682869,Bangladesh,Dhaka,2018-07-03,Munk 2022,PRJEB40816 +ERR4682870,ERR4682870,Kenya,Nairobi,2018-06-29,Munk 2022,PRJEB40816 +ERR4682871,ERR4682871,USA,"Seattle, WA",2018-06-21,Munk 2022,PRJEB40816 +ERR4682872,ERR4682872,USA,"Seattle, WA",2018-06-20,Munk 2022,PRJEB40816 +ERR4682873,ERR4682873,Saint Lucia,Castries,2018-01-01,Munk 2022,PRJEB40816 +ERR4682874,ERR4682874,Saint Lucia,Gros Islet,2018-01-01,Munk 2022,PRJEB40816 +ERR4682875,ERR4682875,North Macedonia,Skopje,2018-01-01,Munk 2022,PRJEB40816 +ERR4682876,ERR4682876,North Macedonia,Skopje,2018-01-01,Munk 2022,PRJEB40816 +ERR4682877,ERR4682877,Australia,Melbourne,2018-06-19,Munk 2022,PRJEB40816 
+ERR4682878,ERR4682878,Australia,Darwin,2018-07-10,Munk 2022,PRJEB40816 +ERR4682879,ERR4682879,Australia,Maningrida,2018-07-10,Munk 2022,PRJEB40816 +ERR4682880,ERR4682880,Cameroon,Yaounde,2018-06-20,Munk 2022,PRJEB40816 +ERR4682881,ERR4682881,Paraguay,San Lorenzo,2018-06-28,Munk 2022,PRJEB40816 +ERR4682882,ERR4682882,Germany,Berlin,2018-07-04,Munk 2022,PRJEB40816 +ERR4682883,ERR4682883,USA,Chapel Hill,2018-09-12,Munk 2022,PRJEB40816 +ERR4682884,ERR4682884,Madagascar,Antananarivo,2018-06-27,Munk 2022,PRJEB40816 +ERR4682885,ERR4682885,Madagascar,Antananarivo,2018-06-28,Munk 2022,PRJEB40816 +ERR4682886,ERR4682886,Singapore,Jurong,2018-06-20,Munk 2022,PRJEB40816 +ERR4682887,ERR4682887,Colombia,Bogota,2018-09-17,Munk 2022,PRJEB40816 +ERR4682888,ERR4682888,Madagascar,Antananarivo,2018-09-24,Munk 2022,PRJEB40816 +ERR4682889,ERR4682889,Thailand,Pathum Thani,2018-09-10,Munk 2022,PRJEB40816 +ERR4682890,ERR4682890,Pakistan,Karachi,2018-06-20,Munk 2022,PRJEB40816 +ERR4682891,ERR4682891,Brazil,Rio de Janeiro,2018-09-17,Munk 2022,PRJEB40816 +ERR4682892,ERR4682892,Brazil,Rio de Janeiro,2018-01-01,Munk 2022,PRJEB40816 +ERR4682893,ERR4682893,USA,"Atlanta, GA",2018-09-27,Munk 2022,PRJEB40816 +ERR4682894,ERR4682894,USA,Waco,2018-06-21,Munk 2022,PRJEB40816 +ERR4682895,ERR4682895,United Arab Emirates,Dubai,2018-06-27,Munk 2022,PRJEB40816 +ERR4682896,ERR4682896,Viet Nam,Ho Chi Minh,2018-01-01,Munk 2022,PRJEB40816 +ERR4682897,ERR4682897,Senegal,Dakar,2018-01-01,Munk 2022,PRJEB40816 +ERR4682898,ERR4682898,China,Guangzhou,2018-06-05,Munk 2022,PRJEB40816 +ERR4682899,ERR4682899,New Zealand,Dunedin,2018-10-16,Munk 2022,PRJEB40816 +ERR4682900,ERR4682900,Tanzania,Moshi,2018-08-03,Munk 2022,PRJEB40816 +ERR4682901,ERR4682901,Denmark,Lyngby,2018-09-05,Munk 2022,PRJEB40816 +ERR4682902,ERR4682902,Denmark,Lyngby,2018-09-12,Munk 2022,PRJEB40816 +ERR4682903,ERR4682903,Denmark,Lyngby,2018-09-18,Munk 2022,PRJEB40816 +ERR4682904,ERR4682904,Denmark,Lyngby,2018-09-19,Munk 2022,PRJEB40816 
+ERR4682905,ERR4682905,Denmark,Lyngby,2018-10-02,Munk 2022,PRJEB40816 +ERR4682906,ERR4682906,Denmark,Lyngby,2018-10-10,Munk 2022,PRJEB40816 +ERR4682907,ERR4682907,Denmark,Lyngby,2018-10-12,Munk 2022,PRJEB40816 +ERR4682908,ERR4682908,Denmark,Lyngby,2018-11-07,Munk 2022,PRJEB40816 +ERR4682909,ERR4682909,Denmark,Lyngby,2018-11-21,Munk 2022,PRJEB40816 +ERR4682910,ERR4682910,Ghana,Accra,2018-07-16,Munk 2022,PRJEB40816 +ERR4682911,ERR4682911,Ghana,Accra,2018-07-16,Munk 2022,PRJEB40816 +ERR4682912,ERR4682912,Ghana,Accra,2018-07-16,Munk 2022,PRJEB40816 +ERR4682913,ERR4682913,Denmark,Lyngby,2018-10-03,Munk 2022,PRJEB40816 diff --git a/data/2024-05-06_munk/Munk-PRJEB40816/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB40816/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..11d0688 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB40816/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/adapters.fasta b/data/2024-05-06_munk/Munk-PRJEB51229/adapters.fasta new file mode 100644 index 0000000..b4c739a --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB51229/adapters.fasta @@ -0,0 +1,41 @@ +>0 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>1 +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>2 +TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG +>3 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>4 +TGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>5 +CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +>6 +GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +>7 +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>8 +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>9 +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>10 +CAAGCAGAAGACGGCATACGAGAT +>11 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>12 +AGATCGGAAGAGCACACGTCTGAACTCCAGTCA +>13 +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>14 +GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG +>15 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>16 +unspecified +>17 +heifigepsna +>18 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATC +T +>19 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT diff --git 
a/data/2024-05-06_munk/Munk-PRJEB51229/hv_clade_counts.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/hv_clade_counts.tsv.gz new file mode 100644 index 0000000..f190ce9 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/hv_clade_counts.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_blast_paired.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_blast_paired.tsv.gz new file mode 100644 index 0000000..efb2b34 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_blast_paired.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_putative_filtered.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_putative_filtered.tsv.gz new file mode 100644 index 0000000..34a737a Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/hv_hits_putative_filtered.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/qc_adapter_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/qc_adapter_stats.tsv.gz new file mode 100644 index 0000000..62d5e0b Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/qc_adapter_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/qc_basic_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/qc_basic_stats.tsv.gz new file mode 100644 index 0000000..cb1b472 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/qc_basic_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_base_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_base_stats.tsv.gz new file mode 100644 index 0000000..5b8e6b0 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_base_stats.tsv.gz differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_sequence_stats.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_sequence_stats.tsv.gz new file mode 100644 index 0000000..9dc1cc1 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/qc_quality_sequence_stats.tsv.gz 
differ diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/sample-metadata.csv b/data/2024-05-06_munk/Munk-PRJEB51229/sample-metadata.csv new file mode 100644 index 0000000..edd8e23 --- /dev/null +++ b/data/2024-05-06_munk/Munk-PRJEB51229/sample-metadata.csv @@ -0,0 +1,94 @@ +library,sample,country,city,date,dataset,bioproject +ERR8977341,ERR8977341,Australia,Melbourne,2017-07-18,Munk 2022,PRJEB51229 +ERR8977342,ERR8977342,Australia,Melbourne,2017-08-16,Munk 2022,PRJEB51229 +ERR8977343,ERR8977343,Pakistan,Karachi,2017-08-18,Munk 2022,PRJEB51229 +ERR8977344,ERR8977344,Pakistan,Karachi,2017-07-19,Munk 2022,PRJEB51229 +ERR8977345,ERR8977345,Greece,Athens,2017-07-31,Munk 2022,PRJEB51229 +ERR8977346,ERR8977346,Greece,Athens,2017-09-29,Munk 2022,PRJEB51229 +ERR8977347,ERR8977347,Malaysia,Kuala Lumpur,2017-07-18,Munk 2022,PRJEB51229 +ERR8977348,ERR8977348,Malaysia,Kuala Lumpur,2017-08-15,Munk 2022,PRJEB51229 +ERR8977349,ERR8977349,Malaysia,Kuala Lumpur,2017-09-12,Munk 2022,PRJEB51229 +ERR8977350,ERR8977350,Malaysia,Kuala Lumpur,2017-10-10,Munk 2022,PRJEB51229 +ERR8977351,ERR8977351,China,Guangzhou,2017-07-11,Munk 2022,PRJEB51229 +ERR8977352,ERR8977352,China,Guangzhou,2017-08-23,Munk 2022,PRJEB51229 +ERR8977353,ERR8977353,China,Guangzhou,2017-09-30,Munk 2022,PRJEB51229 +ERR8977354,ERR8977354,China,Guangzhou,2017-10-13,Munk 2022,PRJEB51229 +ERR8977355,ERR8977355,Pakistan,Karachi,2017-09-28,Munk 2022,PRJEB51229 +ERR8977356,ERR8977356,Pakistan,Karachi,2017-10-30,Munk 2022,PRJEB51229 +ERR8977357,ERR8977357,Australia,Melbourne,2017-09-27,Munk 2022,PRJEB51229 +ERR8977358,ERR8977358,Australia,Melbourne,2017-10-16,Munk 2022,PRJEB51229 +ERR8977359,ERR8977359,Cameroon,Yaounde,2017-10-23,Munk 2022,PRJEB51229 +ERR8977360,ERR8977360,Cameroon,Yaounde,2017-12-20,Munk 2022,PRJEB51229 +ERR8977361,ERR8977361,Cameroon,Yaounde,2018-01-19,Munk 2022,PRJEB51229 +ERR8977362,ERR8977362,Cameroon,Yaounde,2017-07-19,Munk 2022,PRJEB51229 +ERR8977363,ERR8977363,Cameroon,Yaounde,2017-08-18,Munk 
2022,PRJEB51229 +ERR8977364,ERR8977364,Cameroon,Yaounde,2017-09-19,Munk 2022,PRJEB51229 +ERR8977365,ERR8977365,USA,"Seattle, WA",2017-09-28,Munk 2022,PRJEB51229 +ERR8977366,ERR8977366,USA,"Seattle, WA",2017-10-25,Munk 2022,PRJEB51229 +ERR8977367,ERR8977367,USA,"Seattle, WA",2017-01-12,Munk 2022,PRJEB51229 +ERR8977368,ERR8977368,Malaysia,Kuala Lumpur,2017-12-05,Munk 2022,PRJEB51229 +ERR8977369,ERR8977369,Malaysia,Kuala Lumpur,2018-01-04,Munk 2022,PRJEB51229 +ERR8977370,ERR8977370,Malaysia,Kuala Lumpur,2018-01-06,Munk 2022,PRJEB51229 +ERR8977371,ERR8977371,Malaysia,Kuala Lumpur,2017-11-04,Munk 2022,PRJEB51229 +ERR8977372,ERR8977372,Canada,Regina,2017-07-26,Munk 2022,PRJEB51229 +ERR8977373,ERR8977373,Canada,Regina,2017-08-22,Munk 2022,PRJEB51229 +ERR8977374,ERR8977374,Canada,Regina,2017-09-27,Munk 2022,PRJEB51229 +ERR8977375,ERR8977375,Canada,Regina,2017-10-25,Munk 2022,PRJEB51229 +ERR8977376,ERR8977376,Ecuador,Quito,2017-12-23,Munk 2022,PRJEB51229 +ERR8977377,ERR8977377,Ecuador,Quito,2018-01-31,Munk 2022,PRJEB51229 +ERR8977378,ERR8977378,Ecuador,Quito,2018-02-28,Munk 2022,PRJEB51229 +ERR8977379,ERR8977379,Canada,Regina,2017-12-19,Munk 2022,PRJEB51229 +ERR8977380,ERR8977380,Australia,Melbourne,2017-12-21,Munk 2022,PRJEB51229 +ERR8977381,ERR8977381,Australia,Melbourne,2018-01-23,Munk 2022,PRJEB51229 +ERR8977382,ERR8977382,Australia,Melbourne,2018-02-26,Munk 2022,PRJEB51229 +ERR8977383,ERR8977383,China,Guangzhou,2017-12-21,Munk 2022,PRJEB51229 +ERR8977384,ERR8977384,China,Guangzhou,2018-01-10,Munk 2022,PRJEB51229 +ERR8977385,ERR8977385,China,Guangzhou,2018-02-08,Munk 2022,PRJEB51229 +ERR8977386,ERR8977386,China,Guangzhou,2018-03-18,Munk 2022,PRJEB51229 +ERR8977387,ERR8977387,USA,"Seattle, WA",2017-12-20,Munk 2022,PRJEB51229 +ERR8977388,ERR8977388,USA,"Seattle, WA",2018-01-30,Munk 2022,PRJEB51229 +ERR8977389,ERR8977389,USA,"Seattle, WA",2018-02-22,Munk 2022,PRJEB51229 +ERR8977390,ERR8977390,USA,"Seattle, WA",2018-03-29,Munk 2022,PRJEB51229 
+ERR8977391,ERR8977391,Australia,Melbourne,2018-03-27,Munk 2022,PRJEB51229 +ERR8977392,ERR8977392,Australia,Melbourne,2018-04-27,Munk 2022,PRJEB51229 +ERR8977393,ERR8977393,Australia,Melbourne,2018-05-17,Munk 2022,PRJEB51229 +ERR8977394,ERR8977394,Malaysia,Kuala Lumpur,2018-03-16,Munk 2022,PRJEB51229 +ERR8977395,ERR8977395,Malaysia,Kuala Lumpur,2018-04-12,Munk 2022,PRJEB51229 +ERR8977396,ERR8977396,Malaysia,Kuala Lumpur,2018-05-08,Munk 2022,PRJEB51229 +ERR8977397,ERR8977397,USA,"Seattle, WA",2018-04-15,Munk 2022,PRJEB51229 +ERR8977398,ERR8977398,USA,"Seattle, WA",2018-05-13,Munk 2022,PRJEB51229 +ERR8977399,ERR8977399,USA,"Seattle, WA",2018-07-17,Munk 2022,PRJEB51229 +ERR8977400,ERR8977400,USA,"Seattle, WA",2018-08-23,Munk 2022,PRJEB51229 +ERR8977401,ERR8977401,Australia,Melbourne,2018-07-19,Munk 2022,PRJEB51229 +ERR8977402,ERR8977402,Australia,Melbourne,2018-08-30,Munk 2022,PRJEB51229 +ERR8977403,ERR8977403,Cameroon,Yaounde,2018-07-19,Munk 2022,PRJEB51229 +ERR8977404,ERR8977404,Cameroon,Yaounde,2018-08-17,Munk 2022,PRJEB51229 +ERR8977405,ERR8977405,Canada,Regina,2018-03-21,Munk 2022,PRJEB51229 +ERR8977406,ERR8977406,Canada,Regina,2018-04-01,Munk 2022,PRJEB51229 +ERR8977407,ERR8977407,Canada,Regina,2018-05-29,Munk 2022,PRJEB51229 +ERR8977408,ERR8977408,Canada,Regina,2018-07-30,Munk 2022,PRJEB51229 +ERR8977409,ERR8977409,Canada,Regina,2018-08-16,Munk 2022,PRJEB51229 +ERR8977410,ERR8977410,Canada,Regina,2018-09-28,Munk 2022,PRJEB51229 +ERR8977411,ERR8977411,Ecuador,Quito,2018-03-29,Munk 2022,PRJEB51229 +ERR8977412,ERR8977412,Ecuador,Quito,2018-04-26,Munk 2022,PRJEB51229 +ERR8977413,ERR8977413,Ecuador,Quito,2018-05-25,Munk 2022,PRJEB51229 +ERR8977414,ERR8977414,Ecuador,Quito,2018-07-26,Munk 2022,PRJEB51229 +ERR8977415,ERR8977415,Ecuador,Quito,2018-08-29,Munk 2022,PRJEB51229 +ERR8977416,ERR8977416,China,Guangzhou,2018-04-10,Munk 2022,PRJEB51229 +ERR8977417,ERR8977417,China,Guangzhou,2018-05-11,Munk 2022,PRJEB51229 +ERR8977418,ERR8977418,China,Guangzhou,2018-08-09,Munk 
2022,PRJEB51229 +ERR8977419,ERR8977419,China,Guangzhou,2018-09-01,Munk 2022,PRJEB51229 +ERR8977420,ERR8977420,China,Guangzhou,2018-10-09,Munk 2022,PRJEB51229 +ERR8977421,ERR8977421,Cameroon,Yaounde,2018-04-03,Munk 2022,PRJEB51229 +ERR8977422,ERR8977422,Cameroon,Yaounde,2019-05-22,Munk 2022,PRJEB51229 +ERR8977423,ERR8977423,Cameroon,Yaounde,2019-02-21,Munk 2022,PRJEB51229 +ERR8977424,ERR8977424,Cameroon,Yaounde,2018-03-18,Munk 2022,PRJEB51229 +ERR8977425,ERR8977425,Tanzania,Moshi,2018-08-03,Munk 2022,PRJEB51229 +ERR8977426,ERR8977426,Tanzania,Moshi,2018-08-09,Munk 2022,PRJEB51229 +ERR8977427,ERR8977427,Tanzania,Moshi,2018-08-17,Munk 2022,PRJEB51229 +ERR8977428,ERR8977428,Tanzania,Moshi,2018-08-20,Munk 2022,PRJEB51229 +ERR8977429,ERR8977429,Tanzania,Moshi,2018-09-04,Munk 2022,PRJEB51229 +ERR8977430,ERR8977430,Tanzania,Moshi,2018-09-11,Munk 2022,PRJEB51229 +ERR8977431,ERR8977431,Tanzania,Moshi,2018-09-01,Munk 2022,PRJEB51229 +ERR8977432,ERR8977432,Tanzania,Moshi,2018-09-18,Munk 2022,PRJEB51229 +ERR8977433,ERR8977433,Tanzania,Moshi,2018-09-26,Munk 2022,PRJEB51229 \ No newline at end of file diff --git a/data/2024-05-06_munk/Munk-PRJEB51229/taxonomic_composition.tsv.gz b/data/2024-05-06_munk/Munk-PRJEB51229/taxonomic_composition.tsv.gz new file mode 100644 index 0000000..682b169 Binary files /dev/null and b/data/2024-05-06_munk/Munk-PRJEB51229/taxonomic_composition.tsv.gz differ diff --git a/data/2024-05-06_munk/gdp.csv b/data/2024-05-06_munk/gdp.csv new file mode 100644 index 0000000..4926e1a --- /dev/null +++ b/data/2024-05-06_munk/gdp.csv @@ -0,0 +1,266 @@ +country,gdp_per_capita_ppp +Aruba,40144.64744 +Africa Eastern and Southern,3788.413507 +Afghanistan,2168.133765 +Africa Western and Central,4270.578573 +Angola,6882.279123 +Albania,14618.30061 +Andorra, +Arab World,14981.03665 +United Arab Emirates,74826.51842 +Argentina,23007.83659 +Armenia,14924.77693 +American Samoa, +Antigua and Barbuda,24641.22935 +Australia,52746.71829 +Austria,60574.6271 
+Azerbaijan,15054.54461 +Burundi,760.6041261 +Belgium,56621.54176 +Benin,3290.307 +Burkina Faso,2195.355619 +Bangladesh,5699.078134 +Bulgaria,25527.26356 +Bahrain,51879.73634 +"Bahamas, The",35780.75967 +Bosnia and Herzegovina,16354.49014 +Belarus,20106.06891 +Belize,9303.718003 +Bermuda,85145.94364 +Bolivia,8890.462954 +Brazil,15307.94124 +Barbados,16759.38084 +Brunei Darussalam,64029.443 +Bhutan,13061.59027 +Botswana,15758.84288 +Central African Republic,898.3153199 +Canada,50522.16024 +Central Europe and the Baltics,35470.73063 +Switzerland,73732.23533 +Channel Islands, +Chile,25853.77015 +China,16655.39936 +Cote d'Ivoire,5350.010141 +Cameroon,3915.827499 +Democratic Republic of the Congo,1104.843368 +"Congo, Rep.",4267.621523 +Colombia,16091.50445 +Comoros,3430.21084 +Cabo Verde,8071.889116 +Costa Rica,22949.52583 +Caribbean small states,17095.24657 +Cuba, +Curacao,24917.40166 +Cayman Islands,75504.44857 +Cyprus,44686.71875 +Czech Republic,44859.79315 +Germany,58251.77308 +Djibouti,4970.334219 +Dominica,12921.12466 +Denmark,60787.46336 +Dominican Republic,18941.71678 +Algeria,12120.40637 +East Asia & Pacific (excluding high income),15029.81671 +Early-demographic dividend,9778.022166 +East Asia & Pacific,18418.86524 +Europe & Central Asia (excluding high income),23125.52555 +Europe & Central Asia,38041.8527 +Ecuador,11873.28931 +"Egypt, Arab Rep.",12280.60639 +Euro area,50849.94252 +Eritrea, +Spain,43739.63291 +Estonia,39578.08172 +Ethiopia,2274.185566 +European Union,48192.31109 +Fragile and conflict affected situations,4862.385199 +Finland,52569.80006 +Fiji,13803.13539 +France,51240.0683 +Faroe Islands, +"Micronesia, Fed. 
Sts.",3691.015088 +Gabon,15092.0306 +United Kingdom,49911.15665 +Georgia,15624.97002 +Ghana,5572.669189 +Gibraltar, +Guinea,2653.082556 +Gambia,2171.664617 +Guinea-Bissau,1970.71281 +Equatorial Guinea,16929.14277 +Greece,31611.21144 +Grenada,16046.90915 +Guatemala,9040.834113 +Guam, +Guyana,13364.23566 +High income,52772.68699 +Hong Kong,62119.40193 +Honduras,5851.742458 +Heavily indebted poor countries (HIPC),2681.290212 +Croatia,32123.99613 +Haiti,3244.292031 +Hungary,35152.61383 +IBRD only,14038.30444 +IDA & IBRD total,11422.08242 +IDA total,4083.187874 +IDA blend,5272.69117 +Indonesia,12360.69174 +IDA only,3507.1719 +Isle of Man, +India,6897.770037 +Not classified, +Ireland,90940.63476 +Iran,14681.68732 +Iraq,10736.12226 +Iceland,60523.79699 +Israel,41513.45151 +Italy,46470.03733 +Jamaica,10678.84167 +Jordan,9939.036937 +Japan,42678.15487 +Kazakhstan,27469.4151 +Kenya,4796.354095 +Kyrgyz Republic,5481.38014 +Cambodia,4653.630366 +Kiribati,2046.81068 +St. Kitts and Nevis,32500.97176 +South Korea,43865.04314 +Kuwait,49321.47237 +Latin America & Caribbean (excluding high income),16191.37154 +Lao PDR,8172.577923 +Lebanon,17913.88484 +Liberia,1517.372933 +Libya,23491.20006 +Saint Lucia,16116.5083 +Latin America & Caribbean,16877.17242 +Least developed countries: UN classification,3236.758406 +Low income,2021.629474 +Liechtenstein, +Sri Lanka,14217.46866 +Lower middle income,7143.005575 +Low & middle income,11046.15172 +Lesotho,2538.569643 +Late-demographic dividend,18364.29198 +Lithuania,40577.80374 +Luxembourg,121403.8237 +Latvia,33294.50779 +"Macao SAR, China",127747.2191 +St. 
Martin (French part), +Morocco,8566.116211 +Monaco, +Moldova,13318.80593 +Madagascar,1652.713347 +Maldives,21878.22112 +Middle East & North Africa,16999.85821 +Mexico,21095.73542 +Marshall Islands,5961.747684 +Middle income,12057.81251 +North Macedonia,18270.75537 +Mali,2313.159094 +Malta,50291.10789 +Myanmar,5435.52094 +Middle East & North Africa (excluding high income),11100.95819 +Montenegro,24140.67272 +Mongolia,12986.37747 +Northern Mariana Islands, +Mozambique,1344.884913 +Mauritania,5621.653077 +Mauritius,24680.5578 +Malawi,1582.070725 +Malaysia,28848.10085 +North America,63624.11308 +Namibia,10434.09591 +New Caledonia, +Niger,1269.068386 +Nigeria,5291.66297 +Nicaragua,5627.047456 +Netherlands,61089.58342 +Norway,70939.60552 +Nepal,4088.421359 +Nauru,10976.39481 +New Zealand,45215.69951 +OECD members,47139.26554 +Oman,35248.2074 +Other small states,23483.02825 +Pakistan,5376.284117 +Panama,34178.9238 +Peru,13275.28157 +Philippines,9102.189237 +Palau,18554.50727 +Papua New Guinea,4115.37039 +Poland,35487.911 +Pre-demographic dividend,3663.265734 +Puerto Rico,36268.24334 +"Korea, Dem. 
People's Rep.", +Portugal,37845.20997 +Paraguay,14186.92718 +West Bank and Gaza,6510.325436 +Pacific island small states,7508.486105 +Post-demographic dividend,51816.64437 +French Polynesia, +Qatar,94611.73415 +Romania,33550.59066 +Russian Federation,30067.74023 +Rwanda,2283.466203 +South Asia,6561.554987 +Saudi Arabia,49018.9099 +Sudan,4308.421387 +Senegal,3575.974691 +Singapore,102630.9331 +Solomon Islands,2727.894532 +Sierra Leone,1726.069487 +El Salvador,9398.028821 +San Marino,62623.90481 +Somalia,1612.511583 +Serbia,19688.93104 +Sub-Saharan Africa (excluding high income),3980.957746 +South Sudan, +Sub-Saharan Africa,3983.260219 +Small states,21387.59777 +Sao Tome and Principe,3629.701391 +Suriname,19231.42159 +Slovakia,33943.34988 +Slovenia,42747.96211 +Sweden,57229.77675 +Eswatini,8861.814915 +Sint Maarten (Dutch part),45235.08641 +Seychelles,30427.75877 +Syrian Arab Republic, +Turks and Caicos Islands,27033.38156 +Chad,1628.242767 +East Asia & Pacific (IDA & IBRD countries),15196.74512 +Europe & Central Asia (IDA & IBRD countries),24664.0763 +Togo,2151.285143 +Thailand,18760.4124 +Tajikistan,3726.91382 +Turkmenistan,15628.45773 +Latin America & the Caribbean (IDA & IBRD countries),16795.35941 +Timor-Leste,3899.07086 +Middle East & North Africa (IDA & IBRD countries),11155.80497 +Tonga,6613.495736 +South Asia (IDA & IBRD),6561.554987 +Sub-Saharan Africa (IDA & IBRD countries),3983.260219 +Trinidad and Tobago,26787.88747 +Tunisia,11585.14473 +Turkey,28461.17903 +Tuvalu,5036.655673 +Tanzania,2688.211914 +Uganda,2345.44067 +Ukraine,13348.02832 +Upper middle income,17540.11302 +Uruguay,24551.74617 +USA,65120.39466 +Uzbekistan,7717.744117 +St. 
Vincent and the Grenadines,14784.71408 +"Venezuela, RB", +British Virgin Islands, +Virgin Islands (U.S.), +Viet Nam,10686.80302 +Vanuatu,3200.623455 +World,17720.37801 +Samoa,6615.408449 +Kosovo,11798.48801 +"Yemen, Rep.", +South Africa,14438.32771 +Zambia,3515.384644 +Zimbabwe,2296.845429 \ No newline at end of file diff --git a/data/2024-05-06_munk/hdi.csv b/data/2024-05-06_munk/hdi.csv new file mode 100644 index 0000000..d02ad75 --- /dev/null +++ b/data/2024-05-06_munk/hdi.csv @@ -0,0 +1,195 @@ +country,HDI +Switzerland,0.967 +Norway,0.966 +Iceland,0.959 +Hong Kong,0.956 +Denmark,0.952 +Sweden,0.952 +Germany,0.950 +Ireland,0.950 +Singapore,0.949 +Australia,0.946 +Netherlands,0.946 +Belgium,0.942 +Finland,0.942 +Liechtenstein,0.942 +United Kingdom,0.940 +New Zealand,0.939 +United Arab Emirates,0.937 +Canada,0.935 +South Korea,0.929 +Luxembourg,0.927 +USA,0.927 +Austria,0.926 +Slovenia,0.926 +Japan,0.920 +Israel,0.915 +Malta,0.915 +Spain,0.911 +France,0.910 +Cyprus,0.907 +Italy,0.906 +Estonia,0.899 +Czech Republic,0.895 +Greece,0.893 +Bahrain,0.888 +Andorra,0.884 +Poland,0.881 +Latvia,0.879 +Lithuania,0.879 +Croatia,0.878 +Qatar,0.875 +Saudi Arabia,0.875 +Portugal,0.874 +San Marino,0.867 +Chile,0.860 +Slovakia,0.855 +Turkey,0.855 +Hungary,0.851 +Argentina,0.849 +Kuwait,0.847 +Montenegro,0.844 +Saint Kitts and Nevis,0.838 +Uruguay,0.830 +Romania,0.827 +Antigua and Barbuda,0.826 +Brunei Darussalam,0.823 +Russian Federation,0.821 +Bahamas,0.820 +Panama,0.820 +Oman,0.819 +Georgia,0.814 +Trinidad and Tobago,0.814 +Barbados,0.809 +Malaysia,0.807 +Costa Rica,0.806 +Serbia,0.805 +Thailand,0.803 +Kazakhstan,0.802 +Seychelles,0.802 +Belarus,0.801 +Bulgaria,0.799 +Palau,0.797 +Mauritius,0.796 +Grenada,0.793 +Albania,0.789 +China,0.788 +Armenia,0.786 +Mexico,0.781 +Iran,0.780 +Sri Lanka,0.780 +Bosnia and Herzegovina,0.779 +Saint Vincent and the Grenadines,0.772 +Dominican Republic,0.766 +Ecuador,0.765 +North Macedonia,0.765 +Cuba,0.764 +Moldova,0.763 +Maldives,0.762 
+Peru,0.762 +Azerbaijan,0.760 +Brazil,0.760 +Colombia,0.758 +Libya,0.746 +Algeria,0.745 +Turkmenistan,0.744 +Guyana,0.742 +Mongolia,0.741 +Dominica,0.740 +Tonga,0.739 +Jordan,0.736 +Ukraine,0.734 +Tunisia,0.732 +Marshall Islands,0.731 +Paraguay,0.731 +Fiji,0.729 +Egypt,0.728 +Uzbekistan,0.727 +Viet Nam,0.726 +Saint Lucia,0.725 +Lebanon,0.723 +South Africa,0.717 +"Palestine, State of",0.716 +Indonesia,0.713 +Philippines,0.710 +Botswana,0.708 +Jamaica,0.706 +Samoa,0.702 +Kyrgyzstan,0.701 +Belize,0.700 +Venezuela (Bolivarian Republic of),0.699 +Bolivia,0.698 +Morocco,0.698 +Nauru,0.696 +Gabon,0.693 +Suriname,0.690 +Bhutan,0.681 +Tajikistan,0.679 +El Salvador,0.674 +Iraq,0.673 +Bangladesh,0.670 +Nicaragua,0.669 +Cabo Verde,0.661 +Tuvalu,0.653 +Equatorial Guinea,0.650 +India,0.644 +Micronesia (Federated States of),0.634 +Guatemala,0.629 +Kiribati,0.628 +Honduras,0.624 +Lao People's Democratic Republic,0.620 +Vanuatu,0.614 +Sao Tome and Principe,0.613 +Eswatini (Kingdom of),0.610 +Namibia,0.610 +Myanmar,0.608 +Ghana,0.602 +Kenya,0.601 +Nepal,0.601 +Cambodia,0.600 +Congo,0.593 +Angola,0.591 +Cameroon,0.587 +Comoros,0.586 +Zambia,0.569 +Papua New Guinea,0.568 +Timor-Leste,0.566 +Solomon Islands,0.562 +Syrian Arab Republic,0.557 +Haiti,0.552 +Uganda,0.550 +Zimbabwe,0.550 +Nigeria,0.548 +Rwanda,0.548 +Togo,0.547 +Mauritania,0.540 +Pakistan,0.540 +Cote d'Ivoire,0.534 +Tanzania,0.532 +Lesotho,0.521 +Senegal,0.517 +Sudan,0.516 +Djibouti,0.515 +Malawi,0.508 +Benin,0.504 +Gambia,0.495 +Eritrea,0.493 +Ethiopia,0.492 +Liberia,0.487 +Madagascar,0.487 +Guinea-Bissau,0.483 +Democratic Republic of the Congo,0.481 +Guinea,0.471 +Afghanistan,0.462 +Mozambique,0.461 +Sierra Leone,0.458 +Burkina Faso,0.438 +Yemen,0.424 +Burundi,0.420 +Mali,0.410 +Chad,0.394 +Niger,0.394 +Central African Republic,0.387 +South Sudan,0.381 +Somalia,0.380 +, \ No newline at end of file diff --git a/data/2024-05-06_munk/projects.txt b/data/2024-05-06_munk/projects.txt new file mode 100644 index 0000000..acbd876 
--- /dev/null +++ b/data/2024-05-06_munk/projects.txt @@ -0,0 +1,7 @@ +Munk-PRJEB13831 +Munk-PRJEB27054 +Munk-PRJEB27621 +Munk-PRJEB40798 +Munk-PRJEB40815 +Munk-PRJEB40816 +Munk-PRJEB51229 diff --git a/data/2024-05-06_munk/taxid-names.tsv.gz b/data/2024-05-06_munk/taxid-names.tsv.gz new file mode 120000 index 0000000..626546b --- /dev/null +++ b/data/2024-05-06_munk/taxid-names.tsv.gz @@ -0,0 +1 @@ +../2024-04-01_spurbeck/taxid-names.tsv.gz \ No newline at end of file diff --git a/data/2024-05-06_munk/viral-taxids.tsv.gz b/data/2024-05-06_munk/viral-taxids.tsv.gz new file mode 120000 index 0000000..349083e --- /dev/null +++ b/data/2024-05-06_munk/viral-taxids.tsv.gz @@ -0,0 +1 @@ +../2024-03-19_brumfield/viral-taxids.tsv.gz \ No newline at end of file diff --git a/data/2024-05-06_munk/viral_classes.tsv.gz b/data/2024-05-06_munk/viral_classes.tsv.gz new file mode 100644 index 0000000..45a260f Binary files /dev/null and b/data/2024-05-06_munk/viral_classes.tsv.gz differ diff --git a/docs/notebooks/2024-05-06_munk.html b/docs/notebooks/2024-05-06_munk.html new file mode 100644 index 0000000..0e865a1 --- /dev/null +++ b/docs/notebooks/2024-05-06_munk.html @@ -0,0 +1,3309 @@ + + + + + + + + +Will’s Public NAO Notebook - Workflow analysis of Maritz et al. (2019) + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Workflow analysis of Munk et al. (2022)

+

Raw wastewater influent from 101 countries.

+
+
+ + +
+ +
+
Author
+
+

Will Bradshaw

+
+
+ +
+
Published
+
+

May 1, 2024

+
+
+ + +
+ + +
+ + + + +

The last of the P2RA datasets I want to analyze here is Munk et al. (2022), an enormous dataset of >1,000 raw influent samples from 101 countries collected between 2016 and 2019. As in previous DNA studies like Bengtsson-Palme, samples were centrifuged and only the pellet was retained for sequencing, so we expect viral abundance to be low; nevertheless, this is the largest and most comprehensive DNA wastewater dataset we’ve been able to find to date, so it’s worth having a look at what’s in it. The pellet from each sample was resuspended and homogenized with bead-beating, then underwent DNA extraction and library prep before being sequenced using Illumina technology; earlier samples were sequenced on an Illumina HiSeq3000, while later samples were sequenced on a NovaSeq6000, both with 2x150bp reads.

+

The raw data

+

The Munk data comprised 1,189 total samples, of which 1,185 had complete metadata. These samples came from 101 countries, with the largest number of samples coming from the USA, Canada, and Denmark:

+
+
Code
# Importing the data is a bit more complicated this time, as the samples are
+# split across seven (!) pipeline runs, each in its own subdirectory of the
+# base data directory.
+data_dir_base <- "../data/2024-05-06_munk"
+data_dirs <- list.dirs(data_dir_base, recursive = FALSE)
+
+# Data input paths (one entry per pipeline-run subdirectory)
+libraries_paths <- file.path(data_dirs, "sample-metadata.csv")
+basic_stats_paths <- file.path(data_dirs, "qc_basic_stats.tsv.gz")
+adapter_stats_paths <- file.path(data_dirs, "qc_adapter_stats.tsv.gz")
+quality_base_stats_paths <- file.path(data_dirs, "qc_quality_base_stats.tsv.gz")
+quality_seq_stats_paths <- file.path(data_dirs, "qc_quality_sequence_stats.tsv.gz")
+
+# Import the per-run library metadata and combine it into a single table.
+# `date` is parsed as a Date; every other column is read as character.
+ctypes <- cols(date="D", .default="c")
+libraries_raw <- lapply(libraries_paths, read_csv, col_types = ctypes) %>%
+  bind_rows
+libraries <- libraries_raw %>%
+  # Add missing dates by hand. ERR4682809 and ERR4682803 are recorded with a
+  # year-month only ("2018-06"), which does not parse as a full date; they are
+  # assigned the first day of that month (ERR2683170 presumably has the same
+  # problem -- its metadata row is in another run's file).
+  # if_else() is used rather than base ifelse() because it preserves the Date
+  # class, so the column never degrades to a bare number.
+  mutate(date = if_else(sample == "ERR4682809", as_date("2018-06-01"), date),
+         date = if_else(sample == "ERR4682803", as_date("2018-06-01"), date),
+         date = if_else(sample == "ERR2683170", as_date("2017-06-01"), date)) %>%
+  # Filter samples with unknown dates
+  filter(!is.na(date)) %>%
+  # Order samples chronologically (then by location) and freeze that order
+  # into the factor levels of `sample`
+  arrange(date, country, city) %>%
+  mutate(sample = fct_inorder(sample))
+
+
+
+
Code
# Count samples per country, compute each country's share of all samples,
+# and order the country factor from most to fewest samples
+sample_countries <- libraries %>%
+  count(country) %>%
+  mutate(p = n / sum(n)) %>%
+  arrange(desc(p)) %>%
+  mutate(country = fct_inorder(country))
+
+# Bar chart of sample counts by country (x-axis labels shrunk to fit)
+g_countries <- ggplot(sample_countries, aes(x = country, y = n)) +
+  geom_col() +
+  scale_y_continuous(
+    name = "# Samples",
+    limits = c(0, 120),
+    breaks = seq(0, 200, 20),
+    expand = c(0, 0)
+  ) +
+  theme_kit +
+  theme(axis.text.x = element_text(size = rel(0.5)))
+g_countries
+
+
+

+
+
+
+
+

The 1,185 libraries included in this analysis varied dramatically in size, from 33,554 read pairs to over 123 million. The mean number of read pairs per library was 33.5M, and the dataset as a whole comprised 39.7B read pairs and almost 12 terabases of sequence:

+
+
Code
# Import QC data
+# Preprocessing stages reported in the QC tables; used as factor levels
+stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary")
+
+# Read every QC table in `paths`, stack them, and attach library metadata.
+#   paths: character vector of TSV file paths (one per pipeline run)
+# Returns a single table restricted to samples present in the global
+# `libraries` (inner join on `sample`), sorted by sample, with `stage` as a
+# factor over `stages` and `sample` as a factor in order of appearance.
+# NOTE(review): `libraries$sample` is a factor while the QC tables presumably
+# carry `sample` as character -- confirm the join key coercion is as intended.
+import_basic <- function(paths){
+  qc_tables <- lapply(paths, read_tsv, show_col_types = FALSE)
+  qc_joined <- inner_join(bind_rows(qc_tables), libraries, by = "sample")
+  qc_sorted <- arrange(qc_joined, sample)
+  mutate(
+    qc_sorted,
+    stage = factor(stage, levels = stages),
+    sample = fct_inorder(sample)
+  )
+}
+# Variant of import_basic() for QC tables that have a `read_pair` column:
+# rows are additionally sorted by read pair, and `read_pair` is converted to
+# a factor (via character) whose levels follow that sorted order.
+import_basic_paired <- function(paths){
+  by_pair <- arrange(import_basic(paths), read_pair)
+  mutate(by_pair, read_pair = fct_inorder(as.character(read_pair)))
+}
+# Load each QC table across all pipeline runs
+basic_stats <- import_basic(basic_stats_paths)
+adapter_stats <- import_basic_paired(adapter_stats_paths)
+quality_base_stats <- import_basic_paired(quality_base_stats_paths)
+quality_seq_stats <- import_basic_paired(quality_seq_stats_paths)
+
+# Identify small and large datasets: a library is "small" if its raw
+# (pre-processing) read-pair count is at most 10 million
+libraries_small <- basic_stats %>%
+  filter(stage == "raw_concat", n_read_pairs <= 1e7) %>%
+  pull(library)
+libraries <- mutate(libraries, small = library %in% libraries_small)
+basic_stats <- mutate(basic_stats, small = library %in% libraries_small)
+adapter_stats <- mutate(adapter_stats, small = library %in% libraries_small)
+quality_base_stats <- mutate(quality_base_stats,
+                             small = library %in% libraries_small)
+quality_seq_stats <- mutate(quality_seq_stats,
+                            small = library %in% libraries_small)
+
+# Filter to raw data (taken after flagging so the raw tables carry `small`)
+basic_stats_raw <- filter(basic_stats, stage == "raw_concat")
+adapter_stats_raw <- filter(adapter_stats, stage == "raw_concat")
+quality_base_stats_raw <- filter(quality_base_stats, stage == "raw_concat")
+quality_seq_stats_raw <- filter(quality_seq_stats, stage == "raw_concat")
+
+# Get key values for readout
+raw_read_counts <- basic_stats_raw %>% ungroup %>% 
+  summarize(rmin = min(n_read_pairs), rmax=max(n_read_pairs),
+            rmean=mean(n_read_pairs), 
+            rtot = sum(n_read_pairs),
+            btot = sum(n_bases_approx),
+            dmin = min(percent_duplicates), dmax=max(percent_duplicates),
+            dmean=mean(percent_duplicates), .groups = "drop")
+
+
+
+
Code
# Prepare data
+basic_stats_raw_metrics <- basic_stats_raw %>%
+  select(sample, date,
+         `# Read pairs` = n_read_pairs,
+         `Total base pairs\n(approx)` = n_bases_approx,
+         `% Duplicates\n(FASTQC)` = percent_duplicates) %>%
+  pivot_longer(-(sample:date), names_to = "metric", values_to = "value") %>%
+  mutate(metric = fct_inorder(metric))
+
+# Set up plot templates
+g_basic <- ggplot(basic_stats_raw_metrics, aes(x=date, y=value)) +
+  geom_col(position = "dodge") +
+  scale_x_date() +
+  scale_y_continuous(expand=c(0,0)) +
+  expand_limits(y=c(0,100)) +
+  facet_grid(metric~., scales = "free", space="free_x", switch="y") +
+  theme_kit + theme(
+    axis.title.y = element_blank(),
+    strip.text.y = element_text(face="plain")
+  )
+g_basic
+
+
+

+
+
+
+
+

Adapter levels were high, read qualities were variable (in definite need of trimming) and duplicate levels were moderate:

+
+
Code
# Set up plotting templates
+g_qual_raw <- ggplot(mapping=aes(linetype=read_pair,
+                         group=interaction(sample,read_pair))) + 
+  scale_linetype_discrete(name = "Read Pair") +
+  guides(color=guide_legend(nrow=2,byrow=TRUE),
+         linetype = guide_legend(nrow=2,byrow=TRUE)) +
+  theme_base
+
+# Visualize adapters
+g_adapters_raw <- g_qual_raw + 
+  geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) +
+  scale_y_continuous(name="% Adapters", limits=c(0,NA),
+                     breaks = seq(0,100,10), expand=c(0,0)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,500,20), expand=c(0,0)) +
+  facet_grid(.~adapter)
+g_adapters_raw
+
+
+

+
+
+
+
Code
# Visualize quality
+g_quality_base_raw <- g_qual_raw +
+  geom_hline(yintercept=25, linetype="dashed", color="red") +
+  geom_hline(yintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) +
+  scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,500,20), expand=c(0,0))
+g_quality_base_raw
+
+
+

+
+
+
+
Code
g_quality_seq_raw <- g_qual_raw +
+  geom_vline(xintercept=25, linetype="dashed", color="red") +
+  geom_vline(xintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) +
+  scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+  scale_y_continuous(name="# Sequences", expand=c(0,0))
+g_quality_seq_raw
+
+
+

+
+
+
+
+

Preprocessing

+

About 6% of reads on average were lost during cleaning, and a further 10% during deduplication; however, in both cases a minority of samples lost much larger read fractions. Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries.

+
+
Code
n_reads_rel <- basic_stats %>% 
+  select(sample, stage, 
+         percent_duplicates, n_read_pairs) %>%
+  group_by(sample) %>% arrange(sample, stage) %>%
+  mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0),
+         p_reads_lost = 1 - p_reads_retained,
+         p_reads_retained_abs = n_read_pairs / n_read_pairs[1],
+         p_reads_lost_abs = 1-p_reads_retained_abs,
+         p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0))
+n_reads_rel_display <- n_reads_rel %>% 
+  group_by(Stage=stage) %>% 
+  summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"),
+            `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>% 
+  filter(Stage != "raw_concat") %>%
+  mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion")))
+n_reads_rel_display
+
+
+ +
+
+
+
+
Code
g_stage_base <- ggplot(mapping=aes(x=stage, group=sample)) +
+  theme_kit
+
+# Plot reads over preprocessing
+g_reads_stages <- g_stage_base +
+  geom_line(aes(y=n_read_pairs), data=basic_stats) +
+  scale_y_continuous("# Read pairs", expand=c(0,0), limits=c(0,NA))
+g_reads_stages
+
+
+

+
+
+
+
Code
# Plot relative read losses during preprocessing
+g_reads_rel <- g_stage_base +
+  geom_line(aes(y=p_reads_lost_abs_marginal), data=n_reads_rel) +
+  scale_y_continuous("% Total Reads Lost", expand=c(0,0), 
+                     labels = function(x) x*100)
+g_reads_rel
+
+
+

+
+
+
+
+

As usual, data cleaning was very successful at removing adapters and improving read qualities:

+
+
Code
g_qual <- ggplot(mapping=aes(linetype=read_pair, 
+                         group=interaction(sample,read_pair))) + 
+  scale_linetype_discrete(name = "Read Pair") +
+  guides(color=guide_legend(nrow=2,byrow=TRUE),
+         linetype = guide_legend(nrow=2,byrow=TRUE)) +
+  theme_base
+
+# Visualize adapters
+g_adapters <- g_qual + 
+  geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) +
+  scale_y_continuous(name="% Adapters", limits=c(0,20),
+                     breaks = seq(0,50,10), expand=c(0,0)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,140,20), expand=c(0,0)) +
+  facet_grid(stage~adapter)
+g_adapters
+
+
+

+
+
+
+
Code
# Visualize quality
+g_quality_base <- g_qual +
+  geom_hline(yintercept=25, linetype="dashed", color="red") +
+  geom_hline(yintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) +
+  scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) +
+  scale_x_continuous(name="Position", limits=c(0,NA),
+                     breaks=seq(0,140,20), expand=c(0,0)) +
+  facet_grid(stage~.)
+g_quality_base
+
+
+

+
+
+
+
Code
g_quality_seq <- g_qual +
+  geom_vline(xintercept=25, linetype="dashed", color="red") +
+  geom_vline(xintercept=30, linetype="dashed", color="red") +
+  geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) +
+  scale_x_continuous(name="Mean Phred score", expand=c(0,0)) +
+  scale_y_continuous(name="# Sequences", expand=c(0,0)) +
+  facet_grid(stage~.)
+g_quality_seq
+
+
+

+
+
+
+
+

According to FASTQC, cleaning + deduplication was mostly effective at reducing measured duplicate levels, though a few samples retained high measured duplicate levels throughout the pipeline:

+
+
Code
stage_dup <- basic_stats %>% group_by(stage) %>% 
+  summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates),
+            dmean=mean(percent_duplicates), .groups = "drop")
+
+g_dup_stages <- g_stage_base +
+  geom_line(aes(y=percent_duplicates), data=basic_stats) +
+  scale_y_continuous("% Duplicates", limits=c(0,NA), expand=c(0,0))
+g_dup_stages
+
+
+

+
+
+
+
Code
g_readlen_stages <- g_stage_base + 
+  geom_line(aes(y=mean_seq_len), data=basic_stats) +
+  scale_y_continuous("Mean read length (nt)", expand=c(0,0), limits=c(0,NA))
+g_readlen_stages
+
+
+

+
+
+
+
+

High-level composition

+

As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples:

+
+
Code
classifications <- c("Filtered", "Duplicate", "Ribosomal", "Unassigned",
+                     "Bacterial", "Archaeal", "Viral", "Human")
+
+# Import composition data
+comp_paths <- file.path(data_dirs, "taxonomic_composition.tsv.gz")
+comp <- lapply(comp_paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>%
+  inner_join(libraries, by="sample") %>%
+  mutate(classification = factor(classification, levels = classifications))
+  
+
+# Summarize composition
+read_comp_summ <- comp %>% 
+  group_by(classification) %>%
+  summarize(n_reads = sum(n_reads), .groups = "drop_last") %>%
+  mutate(n_reads = replace_na(n_reads,0),
+    p_reads = n_reads/sum(n_reads),
+    pc_reads = p_reads*100)
+
+
+
+
Code
# Prepare plotting templates
+g_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) +
+  theme_xblank + theme(axis.ticks.x = element_blank())
+scale_y_pc_reads <- purrr::partial(scale_y_continuous, name = "% Reads",
+                                   expand = c(0,0), labels = function(y) y*100)
+
+# Plot overall composition
+g_comp <- g_comp_base + geom_col(data = comp, position = "stack", width=1) +
+  scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) +
+  scale_fill_brewer(palette = "Set1", name = "Classification")
+g_comp
+
+
+

+
+
+
+
Code
# Plot composition of minor components
+comp_minor <- comp %>% 
+  filter(classification %in% c("Archaeal", "Viral", "Human", "Other"))
+palette_minor <- brewer.pal(9, "Set1")[6:9]
+g_comp_minor <- g_comp_base + 
+  geom_col(data=comp_minor, position = "stack", width=1) +
+  scale_y_pc_reads() +
+  scale_fill_manual(values=palette_minor, name = "Classification")
+g_comp_minor
+
+
+

+
+
+
+
+
+
Code
p_reads_summ_group <- comp %>%
+  mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), "Excluded", as.character(classification)),
+         classification = fct_inorder(classification)) %>%
+  group_by(classification, sample) %>%
+  summarize(p_reads = sum(p_reads), .groups = "drop") %>%
+  group_by(classification) %>%
+  summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, 
+            pc_mean = mean(p_reads)*100, .groups = "drop")
+p_reads_summ_prep <- p_reads_summ_group %>%
+  mutate(classification = fct_inorder(classification),
+         pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2),
+         display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)"))
+p_reads_summ <- p_reads_summ_prep %>%
+  select(Classification=classification, 
+         `Read Fraction`=display) %>%
+  arrange(Classification)
+p_reads_summ
+
+
+ +
+
+
+

As in previous DNA datasets, the vast majority of classified reads were bacterial in origin. Viral fraction averaged 0.33%, higher than in other DNA wastewater datasets I’ve looked at, and reached >1% in 35 samples. As is common for DNA wastewater data, viral reads were overwhelmingly dominated by Caudoviricetes phages, though Quintoviricetes (parvoviruses) also showed significant prevalence in some samples:

+
+
Code
# # Get Kraken reports
+# reports_paths <- file.path(data_dirs, "kraken_reports.tsv.gz")
+# reports <- lapply(reports_paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>%
+#   inner_join(libraries, by="sample")
+# 
+# Get viral taxonomy
+viral_taxa_path <- file.path(data_dir_base, "viral-taxids.tsv.gz")
+viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE)
+# 
+# # Filter to viral taxa
+# kraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>%
+#   group_by(sample) %>%
+#   mutate(p_reads_viral = n_reads_clade/n_reads_clade[1])
+# kraken_reports_viral_cleaned <- kraken_reports_viral %>%
+#   inner_join(libraries, by="sample") %>%
+#   select(-pc_reads_total, -n_reads_direct, -contains("minimizers")) %>%
+#   select(name, taxid, p_reads_viral, n_reads_clade, everything())
+# 
+# viral_classes <- kraken_reports_viral_cleaned %>% filter(rank == "C")
+
+viral_classes_path <- file.path(data_dir_base, "viral_classes.tsv.gz")
+# write_tsv(viral_classes, viral_classes_path)
+viral_classes <- read_tsv(viral_classes_path, show_col_types = FALSE)
+
+
+
+
Code
major_threshold <- 0.02
+
+# Identify major viral classes
+viral_classes_major_tab <- viral_classes %>% 
+  group_by(name, taxid) %>%
+  summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>%
+  filter(p_reads_viral_max >= major_threshold)
+viral_classes_major_list <- viral_classes_major_tab %>% pull(name)
+viral_classes_major <- viral_classes %>% 
+  filter(name %in% viral_classes_major_list) %>%
+  select(name, taxid, sample, p_reads_viral)
+viral_classes_minor <- viral_classes_major %>% 
+  group_by(sample) %>%
+  summarize(p_reads_viral_major = sum(p_reads_viral), .groups = "drop") %>%
+  mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>%
+  select(name, taxid, sample, p_reads_viral)
+viral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>%
+  arrange(desc(p_reads_viral)) %>% 
+  mutate(name = factor(name, levels=c(viral_classes_major_list, "Other")),
+         p_reads_viral = pmax(p_reads_viral, 0)) %>%
+  rename(p_reads = p_reads_viral, classification=name)
+
+palette_viral <- c(brewer.pal(12, "Set3"), brewer.pal(8, "Dark2"))
+g_classes <- g_comp_base + 
+  geom_col(data=viral_classes_display, position = "stack", width=1) +
+  scale_y_continuous(name="% Viral Reads", limits=c(0,1.01), breaks = seq(0,1,0.2),
+                     expand=c(0,0), labels = function(y) y*100) +
+  scale_fill_manual(values=palette_viral, name = "Viral class")
+  
+g_classes
+
+
+

+
+
+
+
+

Human-infecting virus reads: validation

+

Next, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 331,452 reads were identified as putatively human-viral:

+
+
Code
# Import HV read data
+hv_reads_filtered_paths <- file.path(data_dirs, "hv_hits_putative_filtered.tsv.gz")
+hv_reads_filtered <- lapply(hv_reads_filtered_paths, read_tsv,
+                            show_col_types = FALSE) %>%
+  bind_rows() %>%
+  left_join(libraries, by="sample")
+
+# Count reads
+n_hv_filtered <- hv_reads_filtered %>%
+  group_by(sample, seq_id) %>% count %>%
+  group_by(sample) %>% count %>% 
+  inner_join(basic_stats %>% filter(stage == "ribo_initial") %>% 
+               select(sample, n_read_pairs), by="sample") %>% 
+  rename(n_putative = n, n_total = n_read_pairs) %>% 
+  mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100)
+n_hv_filtered_summ <- n_hv_filtered %>% ungroup %>%
+  summarize(n_putative = sum(n_putative), n_total = sum(n_total), 
+            .groups="drop") %>% 
+  mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100)
+
+
+
+
Code
# Collapse multi-entry sequences
+rmax <- purrr::partial(max, na.rm = TRUE)
+# Reduce a vector to a single value: its first element when every element
+# equals it, otherwise all elements joined with "/". When NA values leave the
+# equality check undetermined, the result is NA.
+collapse <- function(x){
+  head_value <- x[1]
+  is_uniform <- all(x == head_value)
+  ifelse(is_uniform, head_value, paste(x, collapse="/"))
+}
+mrg <- hv_reads_filtered %>% 
+  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%
+  arrange(desc(adj_score_max)) %>%
+  group_by(seq_id) %>%
+  summarize(sample = collapse(sample),
+            genome_id = collapse(genome_id),
+            taxid_best = taxid[1],
+            taxid = collapse(as.character(taxid)),
+            best_alignment_score_fwd = rmax(best_alignment_score_fwd),
+            best_alignment_score_rev = rmax(best_alignment_score_rev),
+            query_len_fwd = rmax(query_len_fwd),
+            query_len_rev = rmax(query_len_rev),
+            query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
+            query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
+            classified = rmax(classified),
+            assigned_name = collapse(assigned_name),
+            assigned_taxid_best = assigned_taxid[1],
+            assigned_taxid = collapse(as.character(assigned_taxid)),
+            assigned_hv = rmax(assigned_hv),
+            hit_hv = rmax(hit_hv),
+            encoded_hits = collapse(encoded_hits),
+            adj_score_fwd = rmax(adj_score_fwd),
+            adj_score_rev = rmax(adj_score_rev)
+            ) %>%
+  inner_join(libraries, by="sample") %>%
+  mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
+                               ifelse(hit_hv, "Kraken2 HV\nhit",
+                                      "No hit or\nassignment"))) %>%
+  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),
+         highscore = adj_score_max >= 20)
+
+# Plot results
+# Shared template for adjusted-score histograms, faceted by Kraken2 status.
+# Bins are 5 score units wide and aligned to 0; the dashed red line marks a
+# score of 20.
+geom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)
+g_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +
+  geom_vline(xintercept=20, linetype="dashed", color="red") +
+  # No custom labeller: the previous one was keyed on `kit`, which is not a
+  # facet variable in this plot, so it never applied. The kraken_label values
+  # already contain explicit line breaks.
+  facet_wrap(~kraken_label, scales = "free_y") +
+  scale_x_continuous(name = "Maximum adjusted alignment score") +
+  scale_y_continuous(name="# Read pairs") +
+  theme_base
+g_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)
+g_vhist_0
+
+
+

+
+
+
+
+

BLASTing these reads against nt, we find that the pipeline performs well, with only a single high-scoring false-positive read:

+
+
Code
# Import paired BLAST results
+blast_paired_paths <- file.path(data_dirs, "hv_hits_blast_paired.tsv.gz")
+blast_paired <- lapply(blast_paired_paths, read_tsv, show_col_types = FALSE) %>% bind_rows
+
+# Add viral status
+blast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%
+  mutate(viral_full = viral & n_reads == 2)
+
+# Compare to Kraken & Bowtie assignments
+# Test, element by element, whether `taxid_1` appears among the "/"-separated
+# taxids stored in `taxid_2`.
+#
+# taxid_1: vector of single taxids (numeric or character).
+# taxid_2: vector of strings holding one taxid or several joined with "/".
+# Returns an unnamed logical vector; NA on either side gives FALSE.
+match_taxid <- function(taxid_1, taxid_2){
+  # Compare whole entries as literal strings rather than building anchored
+  # regexes: the regex form only matched the first, last or sole entry, so a
+  # hit in the middle of a list of three or more ("1/2/3") was missed.
+  is_listed <- function(query, listed){
+    if (is.na(query) || is.na(listed)) return(FALSE)
+    entries <- strsplit(as.character(listed), "/", fixed = TRUE)[[1]]
+    as.character(query) %in% entries
+  }
+  out <- mapply(is_listed, taxid_1, taxid_2, USE.NAMES = FALSE)
+  # mapply() returns an empty list for zero-length input; always hand back a
+  # plain logical vector.
+  return(as.logical(unlist(out)))
+}
+mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)
+blast_assign <- inner_join(blast_viral, mrg_assign, by="seq_id") %>%
+    mutate(taxid_match_bowtie = match_taxid(staxid, taxid),
+           taxid_match_kraken = match_taxid(staxid, assigned_taxid),
+           taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
+blast_out <- blast_assign %>%
+  group_by(seq_id) %>%
+  summarize(viral_status = ifelse(any(viral_full), 2,
+                                  ifelse(any(taxid_match_any), 2,
+                                             ifelse(any(viral), 1, 0))),
+            .groups = "drop")
+
+
+
+
Code
# Merge BLAST results with unenriched read data
+mrg_blast <- full_join(mrg, blast_out, by="seq_id") %>%
+  mutate(viral_status = replace_na(viral_status, 0),
+         viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))
+
+# Plot
+g_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +
+  scale_fill_brewer(palette = "Set1", name = "Viral status")
+g_vhist_1
+
+
+

+
+
+
+
+

My usual disjunctive score threshold of 20 gave precision, sensitivity, and F1 scores all >99%:

+
+
Code
# Score the HV-read filter against reference labels at one score threshold.
+#
+# tab: one row per read pair, with columns adj_score_fwd, adj_score_rev,
+#   assigned_hv and viral_status_out (treated as the ground truth).
+# score_threshold: adjusted alignment score cutoff to evaluate.
+# Returns a one-row tibble: threshold, sensitivity, specificity, precision, f1.
+test_sens_spec <- function(tab, score_threshold){
+  # A read pair is retained if Kraken2 assigned it to an HV taxon or either
+  # mate reaches the threshold. The comparison is >= so that the rule scored
+  # here is the one applied to the data, where high-scoring reads are flagged
+  # with adj_score_max >= 20.
+  tab_retained <- tab %>%
+    mutate(retain_score = (adj_score_fwd >= score_threshold | adj_score_rev >= score_threshold),
+           retain = assigned_hv | retain_score)
+  # Compare labels as text so logical and character encodings both work
+  is_viral <- as.character(tab_retained$viral_status_out) == "TRUE"
+  retained <- as.logical(tab_retained$retain)
+  # Confusion-matrix cells; rows with an undetermined (NA) status are skipped
+  pos_tru <- sum(is_viral & retained, na.rm = TRUE)
+  pos_fls <- sum(!is_viral & retained, na.rm = TRUE)
+  neg_tru <- sum(!is_viral & !retained, na.rm = TRUE)
+  neg_fls <- sum(is_viral & !retained, na.rm = TRUE)
+  sensitivity <- pos_tru / (pos_tru + neg_fls)
+  specificity <- neg_tru / (neg_tru + pos_fls)
+  precision   <- pos_tru / (pos_tru + pos_fls)
+  f1 <- 2 * precision * sensitivity / (precision + sensitivity)
+  tibble(threshold=score_threshold, sensitivity=sensitivity,
+         specificity=specificity, precision=precision, f1=f1)
+}
+# Run test_sens_spec() on `intab` at every threshold in `inrange` and return
+# the metrics in long format (columns: threshold, metric, value).
+range_f1 <- function(intab, inrange=15:45){
+  per_threshold <- lapply(inrange, function(cutoff){
+    test_sens_spec(tab=intab, score_threshold=cutoff)
+  })
+  wide <- bind_rows(per_threshold)
+  pivot_longer(wide, !threshold, names_to="metric", values_to="value")
+}
+stats_0 <- range_f1(mrg_blast)
+g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
+  geom_vline(xintercept=20, color = "red", linetype = "dashed") +
+  geom_line() +
+  scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
+  scale_x_continuous(name = "Adjusted Score Threshold", expand = c(0,0)) +
+  scale_color_brewer(palette="Dark2") +
+  theme_base
+g_stats_0
+
+
+

+
+
+
+
Code
stats_0 %>% filter(threshold == 20) %>% 
+  select(Threshold=threshold, Metric=metric, Value=value)
+
+
+ +
+
+
+

Human-infecting viruses: overall relative abundance

+
+
Code
# Get raw read counts
+read_counts_raw <- basic_stats_raw %>%
+  select(sample, n_reads_raw = n_read_pairs)
+
+# Get HV read counts
+mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | highscore) %>%
+  rename(taxid_all = taxid, taxid = taxid_best)
+read_counts_hv <- mrg_hv %>% filter(hv_status) %>% group_by(sample) %>% 
+  count(name="n_reads_hv")
+read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>%
+  mutate(n_reads_hv = replace_na(n_reads_hv, 0)) %>%
+  inner_join(libraries, by="sample")
+
+# Aggregate
+read_counts_grp <- read_counts %>% group_by(country) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), 
+            n_samples = n(), .groups="drop") %>%
+  mutate(sample= "All samples")
+read_counts_tot <- read_counts_grp %>% group_by(sample) %>%
+  summarize(n_reads_raw = sum(n_reads_raw),
+            n_reads_hv = sum(n_reads_hv), .groups="drop") %>%
+  mutate(country= "All countries")
+read_counts_agg <- bind_rows(read_counts_grp, read_counts_tot) %>%
+  mutate(p_reads_hv = n_reads_hv/n_reads_raw,
+         sample = factor(sample, levels=c(levels(libraries$sample), "All samples")))
+
+
+

Applying a disjunctive cutoff at S=20 identifies 325,390 read pairs as human-viral. This gives an overall relative HV abundance of \(8.19 \times 10^{-6}\), higher than any other DNA WW dataset I’ve analyzed and competitive with many RNA datasets:

+
+
Code
# Visualize
+g_phv_agg <- ggplot(read_counts_agg, aes(x=country)) +
+  geom_point(aes(y=p_reads_hv)) +
+  scale_y_log10("Relative abundance of human virus reads") +
+  theme_kit + theme(axis.text.x = element_text(size=rel(0.5)))
+
+g_phv_agg
+
+
+

+
+
+
+
+
+
Code
# Collate past RA values
+ra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
+                   "Brumfield", 5e-5, "RNA", FALSE,
+                   "Brumfield", 3.66e-7, "DNA", FALSE,
+                   "Spurbeck", 5.44e-6, "RNA", FALSE,
+                   "Yang", 3.62e-4, "RNA", FALSE,
+                   "Rothman (unenriched)", 1.87e-5, "RNA", FALSE,
+                   "Rothman (panel-enriched)", 3.3e-5, "RNA", TRUE,
+                   "Crits-Christoph (unenriched)", 1.37e-5, "RNA", FALSE,
+                   "Crits-Christoph (panel-enriched)", 1.26e-2, "RNA", TRUE,
+                   "Prussin (non-control)", 1.63e-5, "RNA", FALSE,
+                   "Prussin (non-control)", 4.16e-5, "DNA", FALSE,
+                   "Rosario (non-control)", 1.21e-5, "RNA", FALSE,
+                   "Rosario (non-control)", 1.50e-4, "DNA", FALSE,
+                   "Leung", 1.73e-5, "DNA", FALSE,
+                   "Brinch", 3.88e-6, "DNA", FALSE,
+                   "Bengtsson-Palme", 8.86e-8, "DNA", FALSE,
+                   "Ng", 2.90e-7, "DNA", FALSE,
+                   "Maritz", 9.42e-7, "DNA", FALSE
+)
+
+# Collate new RA values
+ra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched,
+                  "Munk", 8.19e-6, "DNA", FALSE)
+
+
+# Plot
+scale_color_na <- purrr::partial(scale_color_brewer, palette="Set1",
+                                 name="Nucleic acid type")
+ra_comp <- bind_rows(ra_past, ra_new) %>% mutate(dataset = fct_inorder(dataset))
+g_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) +
+  geom_point() +
+  scale_color_na() +
+  scale_x_log10(name="Relative abundance of human virus reads") +
+  theme_base + theme(axis.title.y = element_blank())
+g_ra_comp
+
+
+

+
+
+
+
+

One potential explanation for the higher HV fraction in the Munk data compared to other DNA WW datasets is the sample location: whereas Brinch, Maritz, Bengtsson-Palme and Ng are all from highly developed economies with good sanitation, Munk includes samples from numerous countries including many with much lower incomes and development scores. To quickly test this, I took the most recent Human Development Index dataset from the UN (2022)¹ and GDP per capita dataset from the World Bank (PPP, 2019). In both cases, there was a weak negative correlation between the development metric and measured human-viral load:

+
+
Code
# HDI
+hdi_path <- file.path(data_dir_base, "hdi.csv")
+hdi <- read_csv(hdi_path, show_col_types = FALSE)
+read_counts_hdi <- inner_join(read_counts_grp, hdi, by="country") %>%
+  mutate(p_reads_hv = n_reads_hv/n_reads_raw,
+         log_p = log10(p_reads_hv))
+g_hdi <- ggscatter(read_counts_hdi, x="HDI", y="p_reads_hv",
+                   add = "reg.line") +
+  stat_cor(method="pearson") +
+  geom_point() +
+  scale_x_continuous("HDI (2022)") +
+  scale_y_continuous("HV RA") +
+  theme_base
+g_hdi
+
+
+

+
+
+
+
Code
# GDP
+gdp_path <- file.path(data_dir_base, "gdp.csv")
+gdp <- read_csv(gdp_path, show_col_types = FALSE)
+read_counts_gdp <- inner_join(read_counts_grp, gdp, by="country") %>%
+  mutate(p_reads_hv = n_reads_hv/n_reads_raw,
+         log_p = log10(p_reads_hv),
+         log_gdp = log10(gdp_per_capita_ppp))
+g_gdp <- ggscatter(read_counts_gdp, x="log_gdp", y="p_reads_hv",
+                   add = "reg.line") +
+  stat_cor(method = "pearson") +
+  scale_x_continuous("Log GDP per Capita (PPP, Int$, 2019)", labels = function(x) paste0("1e+", x)) +
+  scale_y_continuous("Relative abundance of human virus reads") +
+  theme_base
+g_gdp
+
+
+

+
+
+
+
+

Human-infecting viruses: taxonomy and composition

+

In investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 1,129 samples met this criterion.

+

As usual, at the family level, most samples were dominated by Adenoviridae, Polyomaviridae and Papillomaviridae. Three other families, Parvoviridae, Circoviridae and Herpesviridae, also showed substantial prevalence.

+
+
Code
# Get viral taxon names for putative HV reads
+# Overwrite the names of selected taxids in `viral_taxa` from a single lookup
+# table (taxid -> name). Rows whose taxid is not listed keep their name.
+viral_taxa$name <- local({
+  overrides <- c(
+    "249588" = "Mamastrovirus",
+    "194960" = "Kobuvirus",
+    "688449" = "Salivirus",
+    "585893" = "Picobirnaviridae",
+    "333922" = "Betapapillomavirus",
+    "334207" = "Betapapillomavirus 3",
+    "369960" = "Porcine type-C oncovirus",
+    "333924" = "Betapapillomavirus 2",
+    "687329" = "Anelloviridae",
+    "325455" = "Gammapapillomavirus",
+    "333750" = "Alphapapillomavirus",
+    "694002" = "Betacoronavirus",
+    "334202" = "Mupapillomavirus",
+    "197911" = "Alphainfluenzavirus",
+    "186938" = "Respirovirus",
+    "333926" = "Gammapapillomavirus 1",
+    "337051" = "Betapapillomavirus 1",
+    "337043" = "Alphapapillomavirus 4",
+    "694003" = "Betacoronavirus 1",
+    "334204" = "Mupapillomavirus 2",
+    "334208" = "Betapapillomavirus 4",
+    "333928" = "Gammapapillomavirus 2",
+    "337039" = "Alphapapillomavirus 2",
+    "333929" = "Gammapapillomavirus 3",
+    "337042" = "Alphapapillomavirus 7",
+    "334203" = "Mupapillomavirus 1",
+    "333757" = "Alphapapillomavirus 8",
+    "337050" = "Alphapapillomavirus 6",
+    "333767" = "Alphapapillomavirus 3",
+    "333754" = "Alphapapillomavirus 10",
+    "687363" = "Torque teno virus 24",
+    "687342" = "Torque teno virus 3",
+    "687359" = "Torque teno virus 20",
+    "194441" = "Primate T-lymphotropic virus 2",
+    "334209" = "Betapapillomavirus 5",
+    "194965" = "Aichivirus B",
+    "333930" = "Gammapapillomavirus 4",
+    "337048" = "Alphapapillomavirus 1",
+    "337041" = "Alphapapillomavirus 9",
+    "337049" = "Alphapapillomavirus 11",
+    "337044" = "Alphapapillomavirus 5"
+  )
+  # Position of each row's taxid in the table (NA when it is not listed)
+  hit <- match(viral_taxa$taxid, as.numeric(names(overrides)))
+  listed <- !is.na(hit)
+  updated <- viral_taxa$name
+  updated[listed] <- unname(overrides[hit[listed]])
+  updated
+})
+
+# Filter samples and add viral taxa information
+samples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% pull(sample)
+mrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep, hv_status) %>% left_join(viral_taxa, by="taxid") 
+
+# Discover viral species & genera for HV reads
+# Re-assign reads to their ancestor taxon at a chosen rank.
+#
+# read_db:  reads table with columns seq_id, taxid, parent_taxid, rank, name.
+# taxid_db: taxonomy table joined on taxid; presumably supplies parent_taxid,
+#           rank and name for each taxid (TODO confirm against viral_taxa).
+# out_rank: target rank; must be one of the entries of `ranks` below.
+# verbose:  currently unused.
+#
+# Returns read_db re-annotated from taxid_db: reads that are at out_rank, or
+# reach it by walking up parent_taxid, carry that ancestor's taxid; all other
+# reads keep the taxid they came in with.
+raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
+  ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
+  # Fail loudly on an unknown rank: which.max() on an all-FALSE comparison
+  # returns 1, so a misspelled rank used to be processed without any error.
+  rank_match <- match(out_rank, ranks)
+  if (length(out_rank) != 1 || is.na(rank_match)) {
+    stop("out_rank must be one of: ", paste(ranks, collapse = ", "), call. = FALSE)
+  }
+  # Ranks at or above the target; reads sitting there are never promoted
+  high_ranks <- ranks[rank_match:length(ranks)]
+  # Replace the taxonomy columns with the taxid_db entry for the current taxid
+  annotate_taxa <- function(tab){
+    tab %>% select(-parent_taxid, -rank, -name) %>%
+      left_join(taxid_db, by="taxid")
+  }
+  # Reads still below out_rank that can be promoted further
+  needs_promotion <- function(tab){
+    tab %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid))
+  }
+  reads <- annotate_taxa(read_db)
+  # Reads that are already at the requested rank
+  reads_rank <- filter(reads, rank == out_rank)
+  reads_norank <- needs_promotion(reads)
+  while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
+    # Step every remaining read up to its parent taxon and look that up
+    reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
+      annotate_taxa()
+    reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
+      bind_rows(reads_rank)
+    reads_norank <- needs_promotion(reads_remaining)
+  }
+  # Reads that never reached out_rank are appended with their original taxid
+  reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
+  reads_out <- reads_rank %>% bind_rows(reads_dropped) %>% annotate_taxa()
+  return(reads_out)
+}
+hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
+hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
+hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
+
+
+
+
Code
threshold_major_family <- 0.02
+
+# Tally human-viral reads per sample and family, then express each tally as a
+# fraction of that sample's human-viral reads (result is grouped by sample)
+hv_family_counts <- hv_reads_family %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_hv = n_reads_hv / sum(n_reads_hv))
+
+# A family is "major" if its largest per-sample fraction exceeds the threshold;
+# keep one row per major family, ordered by that largest fraction
+hv_family_major_tab <- hv_family_counts %>%
+  group_by(name) %>%
+  filter(p_reads_hv == max(p_reads_hv)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_hv > threshold_major_family) %>%
+  arrange(desc(p_reads_hv))
+# Pool every non-major family into "Other" and re-total within each sample
+hv_family_counts_major <- hv_family_counts %>%
+  mutate(
+    name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_hv = sum(n_reads_hv),
+    p_reads_hv = sum(p_reads_hv),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(hv_family_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+hv_family_counts_display <- rename(
+  hv_family_counts_major,
+  p_reads = p_reads_hv, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_hv_family <- g_comp_base +
+  geom_col(data = hv_family_counts_display, position = "stack", width = 1) +
+  labs(title = "Family composition of human-viral reads") +
+  scale_fill_manual(name = "Viral family", values = palette_viral) +
+  scale_y_continuous(
+    name = "% HV Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 4)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+g_hv_family
+
+
+

+
+
+
+
Code
# Per-family totals for the text: total reads and largest per-sample fraction
+hv_family_collate <- hv_family_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_max = max(p_reads_hv),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+

In investigating individual viral families, to avoid distortions from a few rare reads, I restricted myself to samples where that family made up at least 10% of human-viral reads:

+
+
Code
threshold_major_species <- 0.05
+taxid_adeno <- 10508
+
+# Samples where Adenoviridae make up at least 10% of human-viral reads, and
+# the IDs of the Adenoviridae reads from those samples
+adeno_samples <- hv_family_counts %>%
+  filter(taxid == taxid_adeno, p_reads_hv >= 0.1) %>%
+  pull(sample)
+adeno_ids <- hv_reads_family %>%
+  filter(taxid == taxid_adeno) %>%
+  filter(sample %in% adeno_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+adeno_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% adeno_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_adeno = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+adeno_species_major_tab <- adeno_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_adeno == max(p_reads_adeno)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_adeno > threshold_major_species) %>%
+  arrange(desc(p_reads_adeno))
+# Pool every non-major species into "Other" and re-total within each sample
+adeno_species_counts_major <- adeno_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% adeno_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_adeno = sum(n_reads_hv),
+    p_reads_adeno = sum(p_reads_adeno),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(adeno_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+adeno_species_counts_display <- rename(
+  adeno_species_counts_major,
+  p_reads = p_reads_adeno, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_adeno_species <- g_comp_base +
+  geom_col(data = adeno_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Adenoviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Adenoviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_adeno_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+adeno_species_collate <- adeno_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_adeno),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
threshold_major_species <- 0.1
+taxid_polyoma <- 151341
+
+# Samples where Polyomaviridae make up at least 10% of human-viral reads, and
+# the IDs of the Polyomaviridae reads from those samples
+polyoma_samples <- hv_family_counts %>%
+  filter(taxid == taxid_polyoma, p_reads_hv >= 0.1) %>%
+  pull(sample)
+polyoma_ids <- hv_reads_family %>%
+  filter(taxid == taxid_polyoma) %>%
+  filter(sample %in% polyoma_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+polyoma_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% polyoma_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_polyoma = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+polyoma_species_major_tab <- polyoma_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_polyoma == max(p_reads_polyoma)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_polyoma > threshold_major_species) %>%
+  arrange(desc(p_reads_polyoma))
+# Pool every non-major species into "Other" and re-total within each sample
+polyoma_species_counts_major <- polyoma_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% polyoma_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_polyoma = sum(n_reads_hv),
+    p_reads_polyoma = sum(p_reads_polyoma),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(polyoma_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+polyoma_species_counts_display <- rename(
+  polyoma_species_counts_major,
+  p_reads = p_reads_polyoma, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_polyoma_species <- g_comp_base +
+  geom_col(data = polyoma_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Polyomaviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Polyomaviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_polyoma_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+polyoma_species_collate <- polyoma_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_polyoma),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
threshold_major_species <- 0.5
+taxid_papilloma <- 151340
+
+# Samples where Papillomaviridae make up at least 10% of human-viral reads,
+# and the IDs of the Papillomaviridae reads from those samples
+papilloma_samples <- hv_family_counts %>%
+  filter(taxid == taxid_papilloma, p_reads_hv >= 0.1) %>%
+  pull(sample)
+papilloma_ids <- hv_reads_family %>%
+  filter(taxid == taxid_papilloma) %>%
+  filter(sample %in% papilloma_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+papilloma_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% papilloma_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_papilloma = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+papilloma_species_major_tab <- papilloma_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_papilloma == max(p_reads_papilloma)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_papilloma > threshold_major_species) %>%
+  arrange(desc(p_reads_papilloma))
+# Pool every non-major species into "Other" and re-total within each sample
+papilloma_species_counts_major <- papilloma_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% papilloma_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_papilloma = sum(n_reads_hv),
+    p_reads_papilloma = sum(p_reads_papilloma),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(papilloma_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+papilloma_species_counts_display <- rename(
+  papilloma_species_counts_major,
+  p_reads = p_reads_papilloma, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_papilloma_species <- g_comp_base +
+  geom_col(data = papilloma_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Papillomaviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Papillomaviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_papilloma_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+papilloma_species_collate <- papilloma_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_papilloma),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
threshold_major_species <- 0.1
+taxid_parvo <- 10780
+
+# Samples where Parvoviridae make up at least 10% of human-viral reads, and
+# the IDs of the Parvoviridae reads from those samples
+parvo_samples <- hv_family_counts %>%
+  filter(taxid == taxid_parvo, p_reads_hv >= 0.1) %>%
+  pull(sample)
+parvo_ids <- hv_reads_family %>%
+  filter(taxid == taxid_parvo) %>%
+  filter(sample %in% parvo_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+parvo_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% parvo_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_parvo = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+parvo_species_major_tab <- parvo_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_parvo == max(p_reads_parvo)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_parvo > threshold_major_species) %>%
+  arrange(desc(p_reads_parvo))
+# Pool every non-major species into "Other" and re-total within each sample
+parvo_species_counts_major <- parvo_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% parvo_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_parvo = sum(n_reads_hv),
+    p_reads_parvo = sum(p_reads_parvo),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(parvo_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+parvo_species_counts_display <- rename(
+  parvo_species_counts_major,
+  p_reads = p_reads_parvo, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_parvo_species <- g_comp_base +
+  geom_col(data = parvo_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Parvoviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Parvoviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_parvo_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+parvo_species_collate <- parvo_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_parvo),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
threshold_major_species <- 0.1
+taxid_circo <- 39724
+
+# Samples where Circoviridae make up at least 10% of human-viral reads, and
+# the IDs of the Circoviridae reads from those samples
+circo_samples <- hv_family_counts %>%
+  filter(taxid == taxid_circo, p_reads_hv >= 0.1) %>%
+  pull(sample)
+circo_ids <- hv_reads_family %>%
+  filter(taxid == taxid_circo) %>%
+  filter(sample %in% circo_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+circo_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% circo_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_circo = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+circo_species_major_tab <- circo_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_circo == max(p_reads_circo)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_circo > threshold_major_species) %>%
+  arrange(desc(p_reads_circo))
+# Pool every non-major species into "Other" and re-total within each sample
+circo_species_counts_major <- circo_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% circo_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_circo = sum(n_reads_hv),
+    p_reads_circo = sum(p_reads_circo),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(circo_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+circo_species_counts_display <- rename(
+  circo_species_counts_major,
+  p_reads = p_reads_circo, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_circo_species <- g_comp_base +
+  geom_col(data = circo_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Circoviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Circoviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_circo_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+circo_species_collate <- circo_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_circo),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+
+
Code
threshold_major_species <- 0.1
+taxid_herpes <- 10292
+
+# Samples where Herpesviridae make up at least 10% of human-viral reads, and
+# the IDs of the Herpesviridae reads from those samples
+herpes_samples <- hv_family_counts %>%
+  filter(taxid == taxid_herpes, p_reads_hv >= 0.1) %>%
+  pull(sample)
+herpes_ids <- hv_reads_family %>%
+  filter(taxid == taxid_herpes) %>%
+  filter(sample %in% herpes_samples) %>%
+  pull(seq_id)
+
+# Tally the selected reads per sample and species, as a fraction of the
+# selected reads in that sample (result is grouped by sample)
+herpes_species_counts <- hv_reads_species %>%
+  filter(seq_id %in% herpes_ids) %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_hv = n(), .groups = "drop") %>%
+  group_by(sample) %>%
+  mutate(p_reads_herpes = n_reads_hv / sum(n_reads_hv))
+
+# A species is "major" if its largest per-sample fraction exceeds the
+# threshold; keep one row per major species, ordered by that fraction
+herpes_species_major_tab <- herpes_species_counts %>%
+  group_by(name) %>%
+  filter(p_reads_herpes == max(p_reads_herpes)) %>%
+  filter(row_number() == 1) %>%
+  filter(p_reads_herpes > threshold_major_species) %>%
+  arrange(desc(p_reads_herpes))
+# Pool every non-major species into "Other" and re-total within each sample
+herpes_species_counts_major <- herpes_species_counts %>%
+  mutate(
+    name_display = ifelse(
+      name %in% herpes_species_major_tab$name, name, "Other"
+    )
+  ) %>%
+  group_by(sample, name_display) %>%
+  summarize(
+    n_reads_herpes = sum(n_reads_hv),
+    p_reads_herpes = sum(p_reads_herpes),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    name_display = factor(
+      name_display,
+      levels = c(herpes_species_major_tab$name, "Other")
+    )
+  )
+# Column names expected by the shared composition plot
+herpes_species_counts_display <- rename(
+  herpes_species_counts_major,
+  p_reads = p_reads_herpes, classification = name_display
+)
+
+# Stacked bar per sample, y axis shown as a percentage
+g_herpes_species <- g_comp_base +
+  geom_col(data = herpes_species_counts_display, position = "stack", width = 1) +
+  labs(title = "Species composition of Herpesviridae reads") +
+  scale_fill_manual(name = "Viral species", values = palette_viral) +
+  scale_y_continuous(
+    name = "% Herpesviridae Reads", labels = function(y) y * 100,
+    limits = c(0, 1.01), breaks = seq(0, 1, 0.2), expand = c(0, 0)
+  ) +
+  guides(fill = guide_legend(ncol = 3)) +
+  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))
+
+g_herpes_species
+
+
+

+
+
+
+
Code
# Per-species totals for the text: total reads and mean per-sample fraction
+herpes_species_collate <- herpes_species_counts %>%
+  group_by(name, taxid) %>%
+  summarize(
+    n_reads_tot = sum(n_reads_hv),
+    p_reads_mean = mean(p_reads_herpes),
+    .groups = "drop"
+  ) %>%
+  arrange(desc(n_reads_tot))
+
+
+

Finally, here again are the overall relative abundances of the specific viral genera I picked out manually in my last entry:

+
+
Code
# Reference genera of interest, split by genome type
+path_genera_rna <- c(
+  "Mamastrovirus", "Enterovirus", "Salivirus", "Kobuvirus", "Norovirus",
+  "Sapovirus", "Rotavirus", "Alphacoronavirus", "Betacoronavirus",
+  "Alphainfluenzavirus", "Betainfluenzavirus", "Lentivirus"
+)
+path_genera_dna <- c(
+  "Mastadenovirus", "Alphapolyomavirus", "Betapolyomavirus",
+  "Alphapapillomavirus", "Betapapillomavirus", "Gammapapillomavirus",
+  "Orthopoxvirus", "Simplexvirus", "Lymphocryptovirus", "Cytomegalovirus",
+  "Dependoparvovirus"
+)
+# One row per genus (RNA first, then DNA), annotated from the taxonomy table
+path_genera <- tibble(
+  name = c(path_genera_rna, path_genera_dna),
+  genome_type = rep(
+    c("RNA genome", "DNA genome"),
+    times = c(length(path_genera_rna), length(path_genera_dna))
+  )
+) %>%
+  left_join(viral_taxa, by = "name")
+
+# Raise all HV reads to genus level and count the reference genera per sample
+mrg_hv_named_all <- left_join(mrg_hv, viral_taxa, by = "taxid")
+hv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, "genus")
+n_path_genera <- hv_reads_genus_all %>%
+  group_by(sample, name, taxid) %>%
+  summarize(n_reads_viral = n(), .groups = "keep") %>%
+  inner_join(path_genera, by = c("name", "taxid")) %>%
+  left_join(read_counts_raw, by = "sample") %>%
+  mutate(p_reads_viral = n_reads_viral / n_reads_raw)
+
+# Pivot wide (filling absent genus/sample pairs with 0) and back to long, so
+# that every genus has a row for every sample present in n_path_genera
+# NOTE(review): a sample with no reads in any reference genus never enters
+# the pivot, so its raw reads are missing from the totals below - confirm
+# this is intended
+n_path_genera_out <- n_path_genera %>%
+  ungroup() %>%
+  select(sample, name, n_reads_viral) %>%
+  pivot_wider(
+    names_from = "name", values_from = "n_reads_viral", values_fill = 0
+  ) %>%
+  pivot_longer(-sample, names_to = "name", values_to = "n_reads_viral") %>%
+  left_join(read_counts_raw, by = "sample") %>%
+  left_join(path_genera, by = "name") %>%
+  mutate(p_reads_viral = n_reads_viral / n_reads_raw)
+
+# Collapse to one row per genus by summing reads over all samples
+n_path_genera_stype <- n_path_genera_out %>%
+  group_by(name, taxid, genome_type) %>%
+  summarize(
+    n_reads_raw = sum(n_reads_raw),
+    n_reads_viral = sum(n_reads_viral),
+    .groups = "drop"
+  ) %>%
+  mutate(
+    sample = "All samples",
+    location = "All locations",
+    p_reads_viral = n_reads_viral / n_reads_raw,
+    na_type = "DNA"
+  )
+
+# Dot plot of overall relative abundance (log scale), faceted by genome type
+g_path_genera <- ggplot(
+  n_path_genera_stype,
+  aes(y = name, x = p_reads_viral)
+) +
+  geom_point() +
+  facet_grid(genome_type ~ ., scales = "free_y") +
+  scale_x_log10(name = "Relative abundance") +
+  theme_base +
+  theme(axis.title.y = element_blank())
+g_path_genera
+
+
+

+
+
+
+
+

Conclusion

+

This is the final P2RA dataset I needed to analyze before we finish re-doing that analysis for publication, so I’m pretty happy to have it done. In terms of the results, things mostly look similar to other DNA WW datasets I’ve looked at, with the notable difference that the total fraction of human-infecting viruses is significantly higher. I’m still not sure what’s causing this elevation; the methods used in this study don’t seem any different from other studies that got much lower fractions, and the fact that this study sampled from developing countries seems like only a partial explanation.

+ + + + +

Footnotes

+
    +
  1. I wasn’t able to quickly find any HDI datasets other than the most recent one, and it didn’t seem worth doing serious digging for this quick analysis.↩︎

  2. +
+
+ + + + \ No newline at end of file diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-1.png new file mode 100644 index 0000000..d5189b7 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-2.png new file mode 100644 index 0000000..630b3a7 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/dev-metrics-linear-2.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-family-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-family-1.png new file mode 100644 index 0000000..9d2464a Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-family-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-adeno-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-adeno-1.png new file mode 100644 index 0000000..abed68d Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-adeno-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-circo-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-circo-1.png new file mode 100644 index 0000000..6495ccb Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-circo-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-herpes-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-herpes-1.png new file mode 100644 index 0000000..61634f0 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-herpes-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-papilloma-1.png 
b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-papilloma-1.png new file mode 100644 index 0000000..6d4c2df Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-papilloma-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-parvo-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-parvo-1.png new file mode 100644 index 0000000..7ce50db Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-parvo-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-polyoma-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-polyoma-1.png new file mode 100644 index 0000000..292f770 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/hv-species-polyoma-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-basic-stats-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-basic-stats-1.png new file mode 100644 index 0000000..ab88544 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-basic-stats-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-blast-results-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-blast-results-1.png new file mode 100644 index 0000000..6b0dfde Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-blast-results-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-1.png new file mode 100644 index 0000000..2fbe47c Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-2.png new file mode 
100644 index 0000000..edf9644 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-composition-all-2.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-countries-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-countries-1.png new file mode 100644 index 0000000..e65e9cb Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-countries-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-f1-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-f1-1.png new file mode 100644 index 0000000..4648da4 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-f1-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-ra-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-ra-1.png new file mode 100644 index 0000000..88778ec Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-ra-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-scores-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-scores-1.png new file mode 100644 index 0000000..240ebba Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-hv-scores-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-1.png new file mode 100644 index 0000000..2a7ccfc Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-2.png new file mode 100644 index 0000000..e12fb27 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-2.png differ diff --git 
a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-3.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-3.png new file mode 100644 index 0000000..87635aa Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-quality-3.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-1.png new file mode 100644 index 0000000..3837738 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-2.png new file mode 100644 index 0000000..c5a63a5 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-2.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-3.png b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-3.png new file mode 100644 index 0000000..93e39bf Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/plot-raw-quality-3.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-1.png new file mode 100644 index 0000000..2ccad54 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-2.png new file mode 100644 index 0000000..f9e59de Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-dedup-2.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-1.png new file mode 
100644 index 0000000..046085b Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-2.png b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-2.png new file mode 100644 index 0000000..c0bc87a Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/preproc-figures-2.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/ra-genera-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/ra-genera-1.png new file mode 100644 index 0000000..d2a3fbb Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/ra-genera-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/ra-hv-past-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/ra-hv-past-1.png new file mode 100644 index 0000000..1305110 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/ra-hv-past-1.png differ diff --git a/docs/notebooks/2024-05-06_munk_files/figure-html/viral-class-composition-1.png b/docs/notebooks/2024-05-06_munk_files/figure-html/viral-class-composition-1.png new file mode 100644 index 0000000..3e87559 Binary files /dev/null and b/docs/notebooks/2024-05-06_munk_files/figure-html/viral-class-composition-1.png differ diff --git a/notebooks/2024-05-06_munk.qmd b/notebooks/2024-05-06_munk.qmd new file mode 100644 index 0000000..8a442ec --- /dev/null +++ b/notebooks/2024-05-06_munk.qmd @@ -0,0 +1,1330 @@ +--- +title: "Workflow analysis of Maritz et al. (2019)" +subtitle: "Wastewater from NYC." 
+author: "Will Bradshaw" +date: 2024-05-01 +format: + html: + code-fold: true + code-tools: true + code-link: true + df-print: paged +editor: visual +title-block-banner: black +draft: true +--- + +```{r} +#| label: preamble +#| include: false + +# Load packages +library(tidyverse) +library(cowplot) +library(patchwork) +library(fastqcr) +library(RColorBrewer) +library(ggpubr) +source("../scripts/aux_plot-theme.R") + +# GGplot themes and scales +theme_base <- theme_base + theme(aspect.ratio = NULL) +theme_rotate <- theme_base + theme( + axis.text.x = element_text(hjust = 1, angle = 45), +) +theme_kit <- theme_rotate + theme( + axis.title.x = element_blank(), +) +theme_xblank <- theme_kit + theme( + axis.text.x = element_blank() +) +tnl <- theme(legend.position = "none") +``` + +The final dataset from the P2RA dataset I want to analyze here is [Munk et al. (2022)](https://www.nature.com/articles/s41467-022-34312-7), an enormous dataset of \>1,000 raw influent samples from 101 countries collected between 2016 and 2019. As in previous DNA studies like Bengtsson-Palme, samples were centrifuged and only the pellet was retained for sequencing, so we expect viral abundance to be low; nevertheless, this is the largest and most comprehensive DNA wastewater dataset we've been able to find to date, so it's worth having a look at what's in it. The pellet from each sample was resuspended, was homogenized with bead-beating, underwent DNA extraction and library prep, and was sequenced using Illumina technology; earlier samples were sequenced on an Illumina HiSeq3000, while later samples were sequenced on a NovaSeq6000, both with 2x150bp reads. + +# The raw data + +The Munk data comprised 1,189 total samples, of which 1,185 had complete metadata. 
These samples came from 101 countries, with the largest number of samples coming from the USA, Canada, and Denmark: + +```{r} +#| warning: false +#| label: import-qc-data + +# Importing the data is a bit more complicated this time as the samples are split across seven (!) pipeline runs +data_dir_base <- "../data/2024-05-06_munk" +data_dirs <- list.dirs(data_dir_base, recursive = FALSE) + +# Data input paths +libraries_paths <- file.path(data_dirs, "sample-metadata.csv") +basic_stats_paths <- file.path(data_dirs, "qc_basic_stats.tsv.gz") +adapter_stats_paths <- file.path(data_dirs, "qc_adapter_stats.tsv.gz") +quality_base_stats_paths <- file.path(data_dirs, "qc_quality_base_stats.tsv.gz") +quality_seq_stats_paths <- file.path(data_dirs, "qc_quality_sequence_stats.tsv.gz") + +# Import libraries and extract metadata from sample names +ctypes <- cols(date="D", .default="c") +libraries_raw <- lapply(libraries_paths, read_csv, col_types = ctypes) %>% + bind_rows +libraries <- libraries_raw %>% + # Add missing dates + mutate(date = ifelse(sample == "ERR4682809", as_date("2018-06-01"), date), + date = ifelse(sample == "ERR4682803", as_date("2018-06-01"), date), + date = ifelse(sample == "ERR2683170", as_date("2017-06-01"), date)) %>% + # Filter samples with unknown dates + filter(!is.na(date)) %>% + arrange(date, country, city) %>% + mutate(sample = fct_inorder(sample), date=as_date(date)) +``` + +```{r} +#| label: plot-countries +#| fig-width: 8 +sample_countries <- libraries %>% group_by(country) %>% count %>% ungroup %>% + mutate(p=n/sum(n)) %>% arrange(desc(p)) %>% mutate(country=fct_inorder(country)) +g_countries <- ggplot(sample_countries, aes(x=country, y=n)) + + geom_col() + + scale_y_continuous(name="# Samples", expand=c(0,0), limits=c(0,120), breaks=seq(0,200,20)) + + theme_kit + theme(axis.text.x = element_text(size=rel(0.5))) +g_countries +``` + +The 1,185 libraries included in this analysis varied dramatically in size, from 33,554 read pairs to over 123 
million. The mean number of read pairs per library was 33.5M, and the dataset as a whole comprised 39.7B read pairs and almost 12 terabases of sequence: + +```{r} +#| label: process-qc-data + +# Import QC data +stages <- c("raw_concat", "cleaned", "dedup", "ribo_initial", "ribo_secondary") +import_basic <- function(paths){ + lapply(paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>% + inner_join(libraries, by="sample") %>% + arrange(sample) %>% + mutate(stage = factor(stage, levels = stages), + sample = fct_inorder(sample)) +} +import_basic_paired <- function(paths){ + import_basic(paths) %>% arrange(read_pair) %>% + mutate(read_pair = fct_inorder(as.character(read_pair))) +} +basic_stats <- import_basic(basic_stats_paths) +adapter_stats <- import_basic_paired(adapter_stats_paths) +quality_base_stats <- import_basic_paired(quality_base_stats_paths) +quality_seq_stats <- import_basic_paired(quality_seq_stats_paths) + +# Identify small and large datasets +basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat") +libraries_small <- basic_stats_raw %>% filter(n_read_pairs <= 1e7) %>% pull(library) +libraries <- libraries %>% mutate(small = library %in% libraries_small) +basic_stats <- basic_stats %>% mutate(small = library %in% libraries_small) +adapter_stats <- adapter_stats %>% mutate(small = library %in% libraries_small) +quality_base_stats <- quality_base_stats %>% mutate(small = library %in% libraries_small) +quality_seq_stats <- quality_seq_stats %>% mutate(small = library %in% libraries_small) + +# Filter to raw data +basic_stats_raw <- basic_stats %>% filter(stage == "raw_concat") +adapter_stats_raw <- adapter_stats %>% filter(stage == "raw_concat") +quality_base_stats_raw <- quality_base_stats %>% filter(stage == "raw_concat") +quality_seq_stats_raw <- quality_seq_stats %>% filter(stage == "raw_concat") + +# Get key values for readout +raw_read_counts <- basic_stats_raw %>% ungroup %>% + summarize(rmin = min(n_read_pairs), 
rmax=max(n_read_pairs), + rmean=mean(n_read_pairs), + rtot = sum(n_read_pairs), + btot = sum(n_bases_approx), + dmin = min(percent_duplicates), dmax=max(percent_duplicates), + dmean=mean(percent_duplicates), .groups = "drop") +``` + +```{r} +#| fig-width: 9 +#| warning: false +#| label: plot-basic-stats + +# Prepare data +basic_stats_raw_metrics <- basic_stats_raw %>% + select(sample, date, + `# Read pairs` = n_read_pairs, + `Total base pairs\n(approx)` = n_bases_approx, + `% Duplicates\n(FASTQC)` = percent_duplicates) %>% + pivot_longer(-(sample:date), names_to = "metric", values_to = "value") %>% + mutate(metric = fct_inorder(metric)) + +# Set up plot templates +g_basic <- ggplot(basic_stats_raw_metrics, aes(x=date, y=value)) + + geom_col(position = "dodge") + + scale_x_date() + + scale_y_continuous(expand=c(0,0)) + + expand_limits(y=c(0,100)) + + facet_grid(metric~., scales = "free", space="free_x", switch="y") + + theme_kit + theme( + axis.title.y = element_blank(), + strip.text.y = element_text(face="plain") + ) +g_basic +``` + +Adapter levels were high, read qualities were variable (in definite need of trimming) and duplicate levels were moderate: + +```{r} +#| label: plot-raw-quality + +# Set up plotting templates +g_qual_raw <- ggplot(mapping=aes(linetype=read_pair, + group=interaction(sample,read_pair))) + + scale_linetype_discrete(name = "Read Pair") + + guides(color=guide_legend(nrow=2,byrow=TRUE), + linetype = guide_legend(nrow=2,byrow=TRUE)) + + theme_base + +# Visualize adapters +g_adapters_raw <- g_qual_raw + + geom_line(aes(x=position, y=pc_adapters), data=adapter_stats_raw) + + scale_y_continuous(name="% Adapters", limits=c(0,NA), + breaks = seq(0,100,10), expand=c(0,0)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,500,20), expand=c(0,0)) + + facet_grid(.~adapter) +g_adapters_raw + +# Visualize quality +g_quality_base_raw <- g_qual_raw + + geom_hline(yintercept=25, linetype="dashed", color="red") + + 
geom_hline(yintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats_raw) + + scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,500,20), expand=c(0,0)) +g_quality_base_raw + +g_quality_seq_raw <- g_qual_raw + + geom_vline(xintercept=25, linetype="dashed", color="red") + + geom_vline(xintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats_raw) + + scale_x_continuous(name="Mean Phred score", expand=c(0,0)) + + scale_y_continuous(name="# Sequences", expand=c(0,0)) +g_quality_seq_raw +``` + +# Preprocessing + +About 6% of reads on average were lost during cleaning, and a further 10% during deduplication; however, in both cases a minority of samples lost much larger read fractions. Very few reads were lost during ribodepletion, as expected for DNA sequencing libraries. 
+ +```{r} +#| label: preproc-table +n_reads_rel <- basic_stats %>% + select(sample, stage, + percent_duplicates, n_read_pairs) %>% + group_by(sample) %>% arrange(sample, stage) %>% + mutate(p_reads_retained = replace_na(n_read_pairs / lag(n_read_pairs), 0), + p_reads_lost = 1 - p_reads_retained, + p_reads_retained_abs = n_read_pairs / n_read_pairs[1], + p_reads_lost_abs = 1-p_reads_retained_abs, + p_reads_lost_abs_marginal = replace_na(p_reads_lost_abs - lag(p_reads_lost_abs), 0)) +n_reads_rel_display <- n_reads_rel %>% + group_by(Stage=stage) %>% + summarize(`% Total Reads Lost (Cumulative)` = paste0(round(min(p_reads_lost_abs*100),1), "-", round(max(p_reads_lost_abs*100),1), " (mean ", round(mean(p_reads_lost_abs*100),1), ")"), + `% Total Reads Lost (Marginal)` = paste0(round(min(p_reads_lost_abs_marginal*100),1), "-", round(max(p_reads_lost_abs_marginal*100),1), " (mean ", round(mean(p_reads_lost_abs_marginal*100),1), ")"), .groups="drop") %>% + filter(Stage != "raw_concat") %>% + mutate(Stage = Stage %>% as.numeric %>% factor(labels=c("Trimming & filtering", "Deduplication", "Initial ribodepletion", "Secondary ribodepletion"))) +n_reads_rel_display +``` + +```{r} +#| label: preproc-figures +#| warning: false +#| fig-height: 4 +#| fig-width: 6 + +g_stage_base <- ggplot(mapping=aes(x=stage, group=sample)) + + theme_kit + +# Plot reads over preprocessing +g_reads_stages <- g_stage_base + + geom_line(aes(y=n_read_pairs), data=basic_stats) + + scale_y_continuous("# Read pairs", expand=c(0,0), limits=c(0,NA)) +g_reads_stages + +# Plot relative read losses during preprocessing +g_reads_rel <- g_stage_base + + geom_line(aes(y=p_reads_lost_abs_marginal), data=n_reads_rel) + + scale_y_continuous("% Total Reads Lost", expand=c(0,0), + labels = function(x) x*100) +g_reads_rel +``` + +As usual, data cleaning was very successful at removing adapters and improving read qualities: + +```{r} +#| warning: false +#| label: plot-quality +#| fig-height: 7 + +g_qual <- 
ggplot(mapping=aes(linetype=read_pair, + group=interaction(sample,read_pair))) + + scale_linetype_discrete(name = "Read Pair") + + guides(color=guide_legend(nrow=2,byrow=TRUE), + linetype = guide_legend(nrow=2,byrow=TRUE)) + + theme_base + +# Visualize adapters +g_adapters <- g_qual + + geom_line(aes(x=position, y=pc_adapters), data=adapter_stats) + + scale_y_continuous(name="% Adapters", limits=c(0,20), + breaks = seq(0,50,10), expand=c(0,0)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,140,20), expand=c(0,0)) + + facet_grid(stage~adapter) +g_adapters + +# Visualize quality +g_quality_base <- g_qual + + geom_hline(yintercept=25, linetype="dashed", color="red") + + geom_hline(yintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=position, y=mean_phred_score), data=quality_base_stats) + + scale_y_continuous(name="Mean Phred score", expand=c(0,0), limits=c(10,45)) + + scale_x_continuous(name="Position", limits=c(0,NA), + breaks=seq(0,140,20), expand=c(0,0)) + + facet_grid(stage~.) +g_quality_base + +g_quality_seq <- g_qual + + geom_vline(xintercept=25, linetype="dashed", color="red") + + geom_vline(xintercept=30, linetype="dashed", color="red") + + geom_line(aes(x=mean_phred_score, y=n_sequences), data=quality_seq_stats) + + scale_x_continuous(name="Mean Phred score", expand=c(0,0)) + + scale_y_continuous(name="# Sequences", expand=c(0,0)) + + facet_grid(stage~.) 
+g_quality_seq +``` + +According to FASTQC, cleaning + deduplication was mostly effective at reducing measured duplicate levels, though a few samples retained high measured duplicate levels throughout the pipeline: + +```{r} +#| label: preproc-dedup +#| fig-height: 3.5 +#| fig-width: 6 + +stage_dup <- basic_stats %>% group_by(stage) %>% + summarize(dmin = min(percent_duplicates), dmax=max(percent_duplicates), + dmean=mean(percent_duplicates), .groups = "drop") + +g_dup_stages <- g_stage_base + + geom_line(aes(y=percent_duplicates), data=basic_stats) + + scale_y_continuous("% Duplicates", limits=c(0,NA), expand=c(0,0)) +g_dup_stages + +g_readlen_stages <- g_stage_base + + geom_line(aes(y=mean_seq_len), data=basic_stats) + + scale_y_continuous("Mean read length (nt)", expand=c(0,0), limits=c(0,NA)) +g_readlen_stages +``` + +# High-level composition + +As before, to assess the high-level composition of the reads, I ran the ribodepleted files through Kraken (using the Standard 16 database) and summarized the results with Bracken. 
Combining these results with the read counts above gives us a breakdown of the inferred composition of the samples: + +```{r} +#| label: prepare-composition + +classifications <- c("Filtered", "Duplicate", "Ribosomal", "Unassigned", + "Bacterial", "Archaeal", "Viral", "Human") + +# Import composition data +comp_paths <- file.path(data_dirs, "taxonomic_composition.tsv.gz") +comp <- lapply(comp_paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>% + inner_join(libraries, by="sample") %>% + mutate(classification = factor(classification, levels = classifications)) + + +# Summarize composition +read_comp_summ <- comp %>% + group_by(classification) %>% + summarize(n_reads = sum(n_reads), .groups = "drop_last") %>% + mutate(n_reads = replace_na(n_reads,0), + p_reads = n_reads/sum(n_reads), + pc_reads = p_reads*100) +``` + +```{r} +#| label: plot-composition-all +#| fig-height: 7 +#| fig-width: 8 + +# Prepare plotting templates +g_comp_base <- ggplot(mapping=aes(x=sample, y=p_reads, fill=classification)) + + theme_xblank + theme(axis.ticks.x = element_blank()) +scale_y_pc_reads <- purrr::partial(scale_y_continuous, name = "% Reads", + expand = c(0,0), labels = function(y) y*100) + +# Plot overall composition +g_comp <- g_comp_base + geom_col(data = comp, position = "stack", width=1) + + scale_y_pc_reads(limits = c(0,1.01), breaks = seq(0,1,0.2)) + + scale_fill_brewer(palette = "Set1", name = "Classification") +g_comp + +# Plot composition of minor components +comp_minor <- comp %>% + filter(classification %in% c("Archaeal", "Viral", "Human", "Other")) +palette_minor <- brewer.pal(9, "Set1")[6:9] +g_comp_minor <- g_comp_base + + geom_col(data=comp_minor, position = "stack", width=1) + + scale_y_pc_reads() + + scale_fill_manual(values=palette_minor, name = "Classification") +g_comp_minor + +``` + +```{r} +#| label: composition-summary + +p_reads_summ_group <- comp %>% + mutate(classification = ifelse(classification %in% c("Filtered", "Duplicate", "Unassigned"), 
"Excluded", as.character(classification)), + classification = fct_inorder(classification)) %>% + group_by(classification, sample) %>% + summarize(p_reads = sum(p_reads), .groups = "drop") %>% + group_by(classification) %>% + summarize(pc_min = min(p_reads)*100, pc_max = max(p_reads)*100, + pc_mean = mean(p_reads)*100, .groups = "drop") +p_reads_summ_prep <- p_reads_summ_group %>% + mutate(classification = fct_inorder(classification), + pc_min = pc_min %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + pc_max = pc_max %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + pc_mean = pc_mean %>% signif(digits=2) %>% sapply(format, scientific=FALSE, trim=TRUE, digits=2), + display = paste0(pc_min, "-", pc_max, "% (mean ", pc_mean, "%)")) +p_reads_summ <- p_reads_summ_prep %>% + select(Classification=classification, + `Read Fraction`=display) %>% + arrange(Classification) +p_reads_summ +``` + +As in previous DNA datasets, the vast majority of classified reads were bacterial in origin. Viral fraction averaged 0.33%, higher than in other DNA wastewater datasets I've looked at, and reached \>1% in 35 samples. 
As is common for DNA wastewater data, viral reads were overwhelmingly dominated by *Caudoviricetes* phages, though *Quintoviricetes* (parvoviruses) also showed significant prevalence in some samples: + +```{r} +#| label: extract-viral-taxa + +# # Get Kraken reports +# reports_paths <- file.path(data_dirs, "kraken_reports.tsv.gz") +# reports <- lapply(reports_paths, read_tsv, show_col_types = FALSE) %>% bind_rows %>% +# inner_join(libraries, by="sample") +# +# Get viral taxonomy +viral_taxa_path <- file.path(data_dir_base, "viral-taxids.tsv.gz") +viral_taxa <- read_tsv(viral_taxa_path, show_col_types = FALSE) +# +# # Filter to viral taxa +# kraken_reports_viral <- filter(reports, taxid %in% viral_taxa$taxid) %>% +# group_by(sample) %>% +# mutate(p_reads_viral = n_reads_clade/n_reads_clade[1]) +# kraken_reports_viral_cleaned <- kraken_reports_viral %>% +# inner_join(libraries, by="sample") %>% +# select(-pc_reads_total, -n_reads_direct, -contains("minimizers")) %>% +# select(name, taxid, p_reads_viral, n_reads_clade, everything()) +# +# viral_classes <- kraken_reports_viral_cleaned %>% filter(rank == "C") + +viral_classes_path <- file.path(data_dir_base, "viral_classes.tsv.gz") +# write_tsv(viral_classes, viral_classes_path) +viral_classes <- read_tsv(viral_classes_path, show_col_types = FALSE) + +``` + +```{r} +#| label: viral-class-composition +#| fig-height: 7 +#| fig-width: 8 + + +major_threshold <- 0.02 + +# Identify major viral classes +viral_classes_major_tab <- viral_classes %>% + group_by(name, taxid) %>% + summarize(p_reads_viral_max = max(p_reads_viral), .groups="drop") %>% + filter(p_reads_viral_max >= major_threshold) +viral_classes_major_list <- viral_classes_major_tab %>% pull(name) +viral_classes_major <- viral_classes %>% + filter(name %in% viral_classes_major_list) %>% + select(name, taxid, sample, p_reads_viral) +viral_classes_minor <- viral_classes_major %>% + group_by(sample) %>% + summarize(p_reads_viral_major = sum(p_reads_viral), .groups = 
"drop") %>% + mutate(name = "Other", taxid=NA, p_reads_viral = 1-p_reads_viral_major) %>% + select(name, taxid, sample, p_reads_viral) +viral_classes_display <- bind_rows(viral_classes_major, viral_classes_minor) %>% + arrange(desc(p_reads_viral)) %>% + mutate(name = factor(name, levels=c(viral_classes_major_list, "Other")), + p_reads_viral = pmax(p_reads_viral, 0)) %>% + rename(p_reads = p_reads_viral, classification=name) + +palette_viral <- c(brewer.pal(12, "Set3"), brewer.pal(8, "Dark2")) +g_classes <- g_comp_base + + geom_col(data=viral_classes_display, position = "stack", width=1) + + scale_y_continuous(name="% Viral Reads", limits=c(0,1.01), breaks = seq(0,1,0.2), + expand=c(0,0), labels = function(y) y*100) + + scale_fill_manual(values=palette_viral, name = "Viral class") + +g_classes + +``` + +# Human-infecting virus reads: validation + +Next, I investigated the human-infecting virus read content of these unenriched samples. A grand total of 331,452 reads were identified as putatively human-viral: + +```{r} +#| label: hv-read-counts + +# Import HV read data +hv_reads_filtered_paths <- file.path(data_dirs, "hv_hits_putative_filtered.tsv.gz") +hv_reads_filtered <- lapply(hv_reads_filtered_paths, read_tsv, + show_col_types = FALSE) %>% + bind_rows() %>% + left_join(libraries, by="sample") + +# Count reads +n_hv_filtered <- hv_reads_filtered %>% + group_by(sample, seq_id) %>% count %>% + group_by(sample) %>% count %>% + inner_join(basic_stats %>% filter(stage == "ribo_initial") %>% + select(sample, n_read_pairs), by="sample") %>% + rename(n_putative = n, n_total = n_read_pairs) %>% + mutate(p_reads = n_putative/n_total, pc_reads = p_reads * 100) +n_hv_filtered_summ <- n_hv_filtered %>% ungroup %>% + summarize(n_putative = sum(n_putative), n_total = sum(n_total), + .groups="drop") %>% + mutate(p_reads = n_putative/n_total, pc_reads = p_reads*100) +``` + +```{r} +#| label: plot-hv-scores +#| warning: false +#| fig-width: 8 + +# Collapse multi-entry sequences 
# max() that ignores NAs. Note that for an all-NA input this returns -Inf
# (with a warning), not NA.
rmax <- purrr::partial(max, na.rm = TRUE)

# Collapse a vector to a single value: the shared value if every element
# equals the first one, otherwise all elements joined with "/".
# Returns NA when equality cannot be decided because of missing values.
# Written with if/else rather than ifelse(): ifelse() on a scalar strips
# attributes from the value it returns.
collapse <- function(x) {
  all_same <- all(x == x[1])
  if (is.na(all_same)) {
    return(NA)
  }
  if (all_same) x[1] else paste(x, collapse="/")
}

# One row per read pair (seq_id). Rows are sorted by best adjusted score first,
# so `taxid[1]` / `assigned_taxid[1]` come from the highest-scoring entry.
mrg <- hv_reads_filtered %>%
  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev, na.rm = TRUE)) %>%
  arrange(desc(adj_score_max)) %>%
  group_by(seq_id) %>%
  summarize(sample = collapse(sample),
            genome_id = collapse(genome_id),
            taxid_best = taxid[1],
            taxid = collapse(as.character(taxid)),
            best_alignment_score_fwd = rmax(best_alignment_score_fwd),
            best_alignment_score_rev = rmax(best_alignment_score_rev),
            query_len_fwd = rmax(query_len_fwd),
            query_len_rev = rmax(query_len_rev),
            query_seq_fwd = query_seq_fwd[!is.na(query_seq_fwd)][1],
            query_seq_rev = query_seq_rev[!is.na(query_seq_rev)][1],
            classified = rmax(classified),
            assigned_name = collapse(assigned_name),
            assigned_taxid_best = assigned_taxid[1],
            assigned_taxid = collapse(as.character(assigned_taxid)),
            assigned_hv = rmax(assigned_hv),
            hit_hv = rmax(hit_hv),
            encoded_hits = collapse(encoded_hits),
            adj_score_fwd = rmax(adj_score_fwd),
            adj_score_rev = rmax(adj_score_rev)
            ) %>%
  inner_join(libraries, by="sample") %>%
  mutate(kraken_label = ifelse(assigned_hv, "Kraken2 HV\nassignment",
                               ifelse(hit_hv, "Kraken2 HV\nhit",
                                      "No hit or\nassignment"))) %>%
  # Recompute the per-pair maximum from the collapsed scores and flag pairs
  # that reach the score threshold of 20
  mutate(adj_score_max = pmax(adj_score_fwd, adj_score_rev),
         highscore = adj_score_max >= 20)

# Plot results
geom_vhist <- purrr::partial(geom_histogram, binwidth=5, boundary=0)
g_vhist_base <- ggplot(mapping=aes(x=adj_score_max)) +
  geom_vline(xintercept=20, linetype="dashed", color="red") +
  facet_wrap(~kraken_label, labeller = labeller(kit = label_wrap_gen(20)), scales = "free_y") +
  scale_x_continuous(name = "Maximum adjusted alignment score") +
  scale_y_continuous(name="# Read pairs") +
  theme_base
g_vhist_0 <- g_vhist_base + geom_vhist(data=mrg)
g_vhist_0
```

BLASTing these reads against nt, we find that the pipeline performs well, with only a single high-scoring
false-positive read:

```{r}
#| label: process-blast-data
#| warning: false

# Import paired BLAST results
blast_paired_paths <- file.path(data_dirs, "hv_hits_blast_paired.tsv.gz")
blast_paired <- lapply(blast_paired_paths, read_tsv, show_col_types = FALSE) %>% bind_rows

# Add viral status
blast_viral <- mutate(blast_paired, viral = staxid %in% viral_taxa$taxid) %>%
  mutate(viral_full = viral & n_reads == 2)

# Compare to Kraken & Bowtie assignments

# Test, element-wise, whether taxid_1[i] is one of the "/"-separated taxids in
# taxid_2[i].
#   taxid_1: vector of single taxids (numeric or character).
#   taxid_2: character vector of one or more taxids joined by "/".
# Returns an unnamed logical vector; FALSE where either side is NA.
# Each entry is split and compared exactly, so a taxid matches at any position
# in the list (including the middle of three or more entries), and zero-length
# input yields logical(0).
match_taxid <- function(taxid_1, taxid_2){
  targets <- as.character(taxid_1)
  candidates <- strsplit(as.character(taxid_2), "/", fixed = TRUE)
  hits <- mapply(function(target, ids) !is.na(target) && target %in% ids,
                 targets, candidates, USE.NAMES = FALSE)
  # mapply() returns an empty list for zero-length input; normalise to logical
  out <- as.logical(hits)
  return(out)
}
mrg_assign <- mrg %>% select(sample, seq_id, taxid, assigned_taxid, adj_score_max)
blast_assign <- inner_join(blast_viral, mrg_assign, by="seq_id") %>%
    mutate(taxid_match_bowtie = match_taxid(staxid, taxid),
           taxid_match_kraken = match_taxid(staxid, assigned_taxid),
           taxid_match_any = taxid_match_bowtie | taxid_match_kraken)
# Per read pair: 2 = both reads hit a viral taxon, or a BLAST hit agrees with
# the Bowtie/Kraken taxid; 1 = some viral hit only; 0 = no viral hit
blast_out <- blast_assign %>%
  group_by(seq_id) %>%
  summarize(viral_status = ifelse(any(viral_full), 2,
                                  ifelse(any(taxid_match_any), 2,
                                             ifelse(any(viral), 1, 0))),
            .groups = "drop")
```

```{r}
#| label: plot-blast-results
#| fig-height: 6
#| warning: false

# Merge BLAST results with unenriched read data
mrg_blast <- full_join(mrg, blast_out, by="seq_id") %>%
  mutate(viral_status = replace_na(viral_status, 0),
         viral_status_out = ifelse(viral_status == 0, FALSE, TRUE))

# Plot
g_vhist_1 <- g_vhist_base + geom_vhist(data=mrg_blast, mapping=aes(fill=viral_status_out)) +
  scale_fill_brewer(palette = "Set1", name = "Viral status")
g_vhist_1
```

My usual disjunctive score threshold of 20 gave precision, sensitivity, and F1 scores all \>99%:

```{r}
#| label: plot-f1

# Compute sensitivity, specificity, precision and F1 for one score threshold.
#   tab: one row per read pair, with columns adj_score_fwd, adj_score_rev,
#        assigned_hv and viral_status_out (BLAST-derived truth).
#   score_threshold: adjusted-score cutoff.
# A pair is retained if Kraken assigned it to a human virus OR either read's
# adjusted score exceeds the threshold. Returns a one-row tibble.
# NOTE(review): the comparison here is strict (`>`), whereas `highscore`
# elsewhere in this notebook is defined with `>= 20` -- confirm which is intended.
test_sens_spec <- function(tab, score_threshold){
  tab_retained <- tab %>%
    mutate(retain_score = (adj_score_fwd > score_threshold | adj_score_rev > score_threshold),
           retain = assigned_hv | retain_score) %>%
    group_by(viral_status_out, retain) %>% count
  # Confusion-matrix cells; sum() yields 0 when a cell is absent from the table
  pos_tru <- tab_retained %>% filter(viral_status_out == "TRUE", retain) %>% pull(n) %>% sum
  pos_fls <- tab_retained %>% filter(viral_status_out != "TRUE", retain) %>% pull(n) %>% sum
  neg_tru <- tab_retained %>% filter(viral_status_out != "TRUE", !retain) %>% pull(n) %>% sum
  neg_fls <- tab_retained %>% filter(viral_status_out == "TRUE", !retain) %>% pull(n) %>% sum
  sensitivity <- pos_tru / (pos_tru + neg_fls)
  specificity <- neg_tru / (neg_tru + pos_fls)
  precision   <- pos_tru / (pos_tru + pos_fls)
  f1 <- 2 * precision * sensitivity / (precision + sensitivity)
  out <- tibble(threshold=score_threshold, sensitivity=sensitivity,
                specificity=specificity, precision=precision, f1=f1)
  return(out)
}

# Evaluate test_sens_spec() at every threshold in `inrange` and return the
# results in long format (columns: threshold, metric, value).
range_f1 <- function(intab, inrange=15:45){
  tss <- purrr::partial(test_sens_spec, tab=intab)
  stats <- lapply(inrange, tss) %>% bind_rows %>%
    pivot_longer(!threshold, names_to="metric", values_to="value")
  return(stats)
}
stats_0 <- range_f1(mrg_blast)
g_stats_0 <- ggplot(stats_0, aes(x=threshold, y=value, color=metric)) +
  geom_vline(xintercept=20, color = "red", linetype = "dashed") +
  geom_line() +
  scale_y_continuous(name = "Value", limits=c(0,1), breaks = seq(0,1,0.2), expand = c(0,0)) +
  scale_x_continuous(name = "Adjusted Score Threshold", expand = c(0,0)) +
  scale_color_brewer(palette="Dark2") +
  theme_base
g_stats_0
stats_0 %>% filter(threshold == 20) %>%
  select(Threshold=threshold, Metric=metric, Value=value)
```

# Human-infecting viruses: overall relative abundance

```{r}
#| label: count-hv-reads

# Get raw read counts
read_counts_raw <- basic_stats_raw %>%
  select(sample, n_reads_raw = n_read_pairs)

# Get HV read counts
mrg_hv <- mrg %>% mutate(hv_status = assigned_hv | highscore) %>%
  rename(taxid_all = taxid, taxid = taxid_best)
read_counts_hv <- mrg_hv %>%
filter(hv_status) %>% group_by(sample) %>% + count(name="n_reads_hv") +read_counts <- read_counts_raw %>% left_join(read_counts_hv, by="sample") %>% + mutate(n_reads_hv = replace_na(n_reads_hv, 0)) %>% + inner_join(libraries, by="sample") + +# Aggregate +read_counts_grp <- read_counts %>% group_by(country) %>% + summarize(n_reads_raw = sum(n_reads_raw), + n_reads_hv = sum(n_reads_hv), + n_samples = n(), .groups="drop") %>% + mutate(sample= "All samples") +read_counts_tot <- read_counts_grp %>% group_by(sample) %>% + summarize(n_reads_raw = sum(n_reads_raw), + n_reads_hv = sum(n_reads_hv), .groups="drop") %>% + mutate(country= "All countries") +read_counts_agg <- bind_rows(read_counts_grp, read_counts_tot) %>% + mutate(p_reads_hv = n_reads_hv/n_reads_raw, + sample = factor(sample, levels=c(levels(libraries$sample), "All samples"))) +``` + +Applying a disjunctive cutoff at S=20 identifies 325,390 read pairs as human-viral. This gives an overall relative HV abundance of $8.19 \times 10^{-6}$; higher than any other DNA WW dataset I've analyzed and competitive with many RNA datasets: + +```{r} +#| label: plot-hv-ra +#| warning: false +#| fig-width: 8 +# Visualize +g_phv_agg <- ggplot(read_counts_agg, aes(x=country)) + + geom_point(aes(y=p_reads_hv)) + + scale_y_log10("Relative abundance of human virus reads") + + theme_kit + theme(axis.text.x = element_text(size=rel(0.5))) + +g_phv_agg +``` + +```{r} +#| label: ra-hv-past + +# Collate past RA values +ra_past <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched, + "Brumfield", 5e-5, "RNA", FALSE, + "Brumfield", 3.66e-7, "DNA", FALSE, + "Spurbeck", 5.44e-6, "RNA", FALSE, + "Yang", 3.62e-4, "RNA", FALSE, + "Rothman (unenriched)", 1.87e-5, "RNA", FALSE, + "Rothman (panel-enriched)", 3.3e-5, "RNA", TRUE, + "Crits-Christoph (unenriched)", 1.37e-5, "RNA", FALSE, + "Crits-Christoph (panel-enriched)", 1.26e-2, "RNA", TRUE, + "Prussin (non-control)", 1.63e-5, "RNA", FALSE, + "Prussin (non-control)", 4.16e-5, "DNA", FALSE, + 
"Rosario (non-control)", 1.21e-5, "RNA", FALSE, + "Rosario (non-control)", 1.50e-4, "DNA", FALSE, + "Leung", 1.73e-5, "DNA", FALSE, + "Brinch", 3.88e-6, "DNA", FALSE, + "Bengtsson-Palme", 8.86e-8, "DNA", FALSE, + "Ng", 2.90e-7, "DNA", FALSE, + "Maritz", 9.42e-7, "DNA", FALSE +) + +# Collate new RA values +ra_new <- tribble(~dataset, ~ra, ~na_type, ~panel_enriched, + "Munk", 8.19e-6, "DNA", FALSE) + + +# Plot +scale_color_na <- purrr::partial(scale_color_brewer, palette="Set1", + name="Nucleic acid type") +ra_comp <- bind_rows(ra_past, ra_new) %>% mutate(dataset = fct_inorder(dataset)) +g_ra_comp <- ggplot(ra_comp, aes(y=dataset, x=ra, color=na_type)) + + geom_point() + + scale_color_na() + + scale_x_log10(name="Relative abundance of human virus reads") + + theme_base + theme(axis.title.y = element_blank()) +g_ra_comp +``` + +One potential explanation for the higher HV fraction in the Munk data compared to other DNA WW datasets is the sample location: whereas Brinch, Maritz, Bengtsson-Palme and Ng are all from highly developed economies with good sanitation, Munk includes samples from numerous countries including many with much lower incomes and development scores. To quickly test this, I took the most recent Human Development Index dataset from the UN (2022[^1]) and GDP per capita dataset from the World Bank (PPP, 2019). In both cases, there was a weak negative correlation between the development metric and measured human-viral load: + +[^1]: I wasn't able to quickly find any HDI datasets other than the most recent one, and it didn't seem worth doing serious digging for this quick analysis. 
+ +```{r} +#| label: dev-metrics-linear + +# HDI +hdi_path <- file.path(data_dir_base, "hdi.csv") +hdi <- read_csv(hdi_path, show_col_types = FALSE) +read_counts_hdi <- inner_join(read_counts_grp, hdi, by="country") %>% + mutate(p_reads_hv = n_reads_hv/n_reads_raw, + log_p = log10(p_reads_hv)) +g_hdi <- ggscatter(read_counts_hdi, x="HDI", y="p_reads_hv", + add = "reg.line") + + stat_cor(method="pearson") + + geom_point() + + scale_x_continuous("HDI (2022)") + + scale_y_continuous("HV RA") + + theme_base +g_hdi + +# GDP +gdp_path <- file.path(data_dir_base, "gdp.csv") +gdp <- read_csv(gdp_path, show_col_types = FALSE) +read_counts_gdp <- inner_join(read_counts_grp, gdp, by="country") %>% + mutate(p_reads_hv = n_reads_hv/n_reads_raw, + log_p = log10(p_reads_hv), + log_gdp = log10(gdp_per_capita_ppp)) +g_gdp <- ggscatter(read_counts_gdp, x="log_gdp", y="p_reads_hv", + add = "reg.line") + + stat_cor(method = "pearson") + + scale_x_continuous("Log GDP per Capita (PPP, Int$, 2019)", labels = function(x) paste0("1e+", x)) + + scale_y_continuous("Relative abundance of human virus reads") + + theme_base +g_gdp +``` + +# Human-infecting viruses: taxonomy and composition + +In investigating the taxonomy of human-infecting virus reads, I restricted my analysis to samples with more than 5 HV read pairs total across all viruses, to reduce noise arising from extremely low HV read counts in some samples. 1,129 samples met this criterion. + +As usual, at the family level, most samples were dominated by *Adenoviridae*, *Polyomaviridae* and *Papillomaviridae.* Three other families, *Parvoviridae*, *Circoviridae* and *Herpesviridae*, also showed substantial prevalence. 
```{r}
#| label: raise-hv-taxa

# Get viral taxon names for putative HV reads
# Manual display names for specific taxids, applied in one pass below.
taxon_name_fixes <- tribble(
  ~taxid, ~name,
  249588, "Mamastrovirus",
  194960, "Kobuvirus",
  688449, "Salivirus",
  585893, "Picobirnaviridae",
  333922, "Betapapillomavirus",
  334207, "Betapapillomavirus 3",
  369960, "Porcine type-C oncovirus",
  333924, "Betapapillomavirus 2",
  687329, "Anelloviridae",
  325455, "Gammapapillomavirus",
  333750, "Alphapapillomavirus",
  694002, "Betacoronavirus",
  334202, "Mupapillomavirus",
  197911, "Alphainfluenzavirus",
  186938, "Respirovirus",
  333926, "Gammapapillomavirus 1",
  337051, "Betapapillomavirus 1",
  337043, "Alphapapillomavirus 4",
  694003, "Betacoronavirus 1",
  334204, "Mupapillomavirus 2",
  334208, "Betapapillomavirus 4",
  333928, "Gammapapillomavirus 2",
  337039, "Alphapapillomavirus 2",
  333929, "Gammapapillomavirus 3",
  337042, "Alphapapillomavirus 7",
  334203, "Mupapillomavirus 1",
  333757, "Alphapapillomavirus 8",
  337050, "Alphapapillomavirus 6",
  333767, "Alphapapillomavirus 3",
  333754, "Alphapapillomavirus 10",
  687363, "Torque teno virus 24",
  687342, "Torque teno virus 3",
  687359, "Torque teno virus 20",
  194441, "Primate T-lymphotropic virus 2",
  334209, "Betapapillomavirus 5",
  194965, "Aichivirus B",
  333930, "Gammapapillomavirus 4",
  337048, "Alphapapillomavirus 1",
  337041, "Alphapapillomavirus 9",
  337049, "Alphapapillomavirus 11",
  337044, "Alphapapillomavirus 5"
)
# Overwrite the name wherever the taxid has an override; leave the rest as-is
fix_idx <- match(viral_taxa$taxid, taxon_name_fixes$taxid)
viral_taxa$name <- ifelse(is.na(fix_idx), viral_taxa$name,
                          taxon_name_fixes$name[fix_idx])

# Filter samples and add viral taxa information
samples_keep <- read_counts %>% filter(n_reads_hv > 5) %>% pull(sample)
mrg_hv_named <- mrg_hv %>% filter(sample %in% samples_keep, hv_status) %>% left_join(viral_taxa, by="taxid")

# Discover viral species & genera for HV reads

# Raise each read's taxid to a target taxonomic rank by walking up parent links.
#   read_db:  one row per read, with columns seq_id, taxid, parent_taxid, rank
#             and name (the last three are dropped and re-joined from taxid_db).
#   taxid_db: taxonomy table with columns taxid, parent_taxid, rank and name.
#   out_rank: target rank; must be one of the entries of `ranks` below.
#   verbose:  currently unused; kept for interface compatibility.
# Returns the reads re-annotated at `out_rank` where that rank could be reached.
# Reads that could not be raised (already above the target rank, or whose
# lineage leaves taxid_db) are appended with their original taxid.
# Stops with an error if `out_rank` is not a recognised rank.
raise_rank <- function(read_db, taxid_db, out_rank = "species", verbose = FALSE){
  # Get higher ranks than search rank
  ranks <- c("subspecies", "species", "subgenus", "genus", "subfamily", "family", "suborder", "order", "class", "subphylum", "phylum", "kingdom", "superkingdom")
  rank_match <- match(out_rank, ranks)
  # Fail loudly on an unknown rank instead of silently falling back to the
  # first entry of `ranks`
  if (length(out_rank) != 1 || is.na(rank_match)) {
    stop("`out_rank` must be one of: ", paste(ranks, collapse = ", "),
         call. = FALSE)
  }
  high_ranks <- ranks[rank_match:length(ranks)]
  # Helper: replace a table's taxonomy columns with those of its current taxid
  join_taxonomy <- function(tab) {
    tab %>% select(-parent_taxid, -rank, -name) %>%
      left_join(taxid_db, by="taxid")
  }
  # Helper: reads still below the target rank that can be promoted further.
  # The parent_taxid check drops self-parented taxa (and missing parents),
  # which would otherwise keep the promotion loop from terminating.
  below_target <- function(tab) {
    tab %>% filter(rank != out_rank, !rank %in% high_ranks, !is.na(taxid),
                   parent_taxid != taxid)
  }
  # Merge read DB and taxid DB
  reads <- join_taxonomy(read_db)
  # Extract sequences that are already at appropriate rank
  reads_rank <- filter(reads, rank == out_rank)
  # Drop sequences at a higher rank and return unclassified sequences
  reads_norank <- below_target(reads)
  while(nrow(reads_norank) > 0){ # As long as there are unclassified sequences...
    # Promote read taxids and re-merge with taxid DB, then re-classify and filter
    reads_remaining <- reads_norank %>% mutate(taxid = parent_taxid) %>%
      join_taxonomy
    reads_rank <- reads_remaining %>% filter(rank == out_rank) %>%
      bind_rows(reads_rank)
    reads_norank <- below_target(reads_remaining)
  }
  # Finally, extract and append reads that were excluded during the process
  reads_dropped <- reads %>% filter(!seq_id %in% reads_rank$seq_id)
  reads_out <- reads_rank %>% bind_rows(reads_dropped) %>%
    join_taxonomy
  return(reads_out)
}
hv_reads_species <- raise_rank(mrg_hv_named, viral_taxa, "species")
hv_reads_genus <- raise_rank(mrg_hv_named, viral_taxa, "genus")
hv_reads_family <- raise_rank(mrg_hv_named, viral_taxa, "family")
```

```{r}
#| label: hv-family
#| fig-height: 5
#| fig-width: 7

threshold_major_family <- 0.02

# Count reads for each human-viral family
hv_family_counts <- hv_reads_family %>%
  group_by(sample, name, taxid) %>%
  count(name = "n_reads_hv") %>%
  group_by(sample) %>%
  mutate(p_reads_hv = n_reads_hv/sum(n_reads_hv))

# Identify high-ranking families and group others
# (a family is "major" if it exceeds the threshold in at least one sample)
hv_family_major_tab <- hv_family_counts %>% group_by(name) %>%
  filter(p_reads_hv == max(p_reads_hv)) %>% filter(row_number() == 1) %>%
  arrange(desc(p_reads_hv)) %>% filter(p_reads_hv > threshold_major_family)
hv_family_counts_major <- hv_family_counts %>%
  mutate(name_display = ifelse(name %in% hv_family_major_tab$name, name, "Other")) %>%
  group_by(sample, name_display) %>%
  summarize(n_reads_hv = sum(n_reads_hv), p_reads_hv = sum(p_reads_hv),
            .groups="drop") %>%
  mutate(name_display = factor(name_display,
                               levels = c(hv_family_major_tab$name, "Other")))
#| label: hv-species-adeno
#| fig-height: 5
#| fig-width: 7

# Species-level composition of Adenoviridae reads.
# A species is shown by name only if its highest per-sample share exceeds
# threshold_major_species; everything else is pooled into "Other".
threshold_major_species <- 0.05
taxid_adeno <- 10508

# Samples where Adenoviridae make up at least 10% of human-viral reads,
# and the IDs of the Adenoviridae reads found in those samples
adeno_samples <- hv_family_counts %>%
  filter(taxid == taxid_adeno, p_reads_hv >= 0.1) %>%
  pull(sample)
adeno_ids <- hv_reads_family %>%
  filter(sample %in% adeno_samples, taxid == taxid_adeno) %>%
  pull(seq_id)

# Per-sample read count and within-sample fraction for every species
adeno_species_counts <- hv_reads_species %>%
  filter(seq_id %in% adeno_ids) %>%
  group_by(sample, name, taxid) %>%
  summarize(n_reads_hv = n(), .groups = "drop") %>%
  group_by(sample) %>%
  mutate(p_reads_adeno = n_reads_hv / sum(n_reads_hv))

# Identify high-ranking species (one row per species: its peak sample)
adeno_species_major_tab <- adeno_species_counts %>%
  group_by(name) %>%
  filter(p_reads_adeno == max(p_reads_adeno)) %>%
  filter(row_number() == 1) %>%
  filter(p_reads_adeno > threshold_major_species) %>%
  arrange(desc(p_reads_adeno))

# Pool the remaining species under "Other" and fix the legend order
adeno_species_counts_major <- adeno_species_counts %>%
  mutate(
    name_display = ifelse(name %in% adeno_species_major_tab$name, name, "Other")
  ) %>%
  group_by(sample, name_display) %>%
  summarize(
    n_reads_adeno = sum(n_reads_hv),
    p_reads_adeno = sum(p_reads_adeno),
    .groups = "drop"
  ) %>%
  mutate(
    name_display = factor(
      name_display,
      levels = c(adeno_species_major_tab$name, "Other")
    )
  )
adeno_species_counts_display <- adeno_species_counts_major %>%
  rename(classification = name_display, p_reads = p_reads_adeno)

# Plot
g_adeno_species <- g_comp_base +
  geom_col(data = adeno_species_counts_display, position = "stack", width = 1) +
  labs(title = "Species composition of Adenoviridae reads") +
  scale_fill_manual(values = palette_viral, name = "Viral species") +
  scale_y_continuous(
    name = "% Adenoviridae Reads",
    limits = c(0, 1.01),
    breaks = seq(0, 1, 0.2),
    expand = c(0, 0),
    labels = function(frac) frac * 100
  ) +
  guides(fill = guide_legend(ncol = 3)) +
  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))

g_adeno_species

# Get most prominent species for text
adeno_species_collate <- adeno_species_counts %>%
  group_by(name, taxid) %>%
  summarize(
    n_reads_tot = sum(n_reads_hv),
    p_reads_mean = mean(p_reads_adeno),
    .groups = "drop"
  ) %>%
  arrange(desc(n_reads_tot))
#| label: hv-species-papilloma
#| fig-height: 5
#| fig-width: 7

# Species-level composition of Papillomaviridae reads.
# A species is shown by name only if its highest per-sample share exceeds
# threshold_major_species; everything else is pooled into "Other".
threshold_major_species <- 0.5
taxid_papilloma <- 151340

# Samples where Papillomaviridae make up at least 10% of human-viral reads,
# and the IDs of the Papillomaviridae reads found in those samples
papilloma_samples <- hv_family_counts %>%
  filter(taxid == taxid_papilloma, p_reads_hv >= 0.1) %>%
  pull(sample)
papilloma_ids <- hv_reads_family %>%
  filter(sample %in% papilloma_samples, taxid == taxid_papilloma) %>%
  pull(seq_id)

# Per-sample read count and within-sample fraction for every species
papilloma_species_counts <- hv_reads_species %>%
  filter(seq_id %in% papilloma_ids) %>%
  group_by(sample, name, taxid) %>%
  summarize(n_reads_hv = n(), .groups = "drop") %>%
  group_by(sample) %>%
  mutate(p_reads_papilloma = n_reads_hv / sum(n_reads_hv))

# Identify high-ranking species (one row per species: its peak sample)
papilloma_species_major_tab <- papilloma_species_counts %>%
  group_by(name) %>%
  filter(p_reads_papilloma == max(p_reads_papilloma)) %>%
  filter(row_number() == 1) %>%
  filter(p_reads_papilloma > threshold_major_species) %>%
  arrange(desc(p_reads_papilloma))

# Pool the remaining species under "Other" and fix the legend order
papilloma_species_counts_major <- papilloma_species_counts %>%
  mutate(
    name_display = ifelse(name %in% papilloma_species_major_tab$name,
                          name, "Other")
  ) %>%
  group_by(sample, name_display) %>%
  summarize(
    n_reads_papilloma = sum(n_reads_hv),
    p_reads_papilloma = sum(p_reads_papilloma),
    .groups = "drop"
  ) %>%
  mutate(
    name_display = factor(
      name_display,
      levels = c(papilloma_species_major_tab$name, "Other")
    )
  )
papilloma_species_counts_display <- papilloma_species_counts_major %>%
  rename(classification = name_display, p_reads = p_reads_papilloma)

# Plot
g_papilloma_species <- g_comp_base +
  geom_col(data = papilloma_species_counts_display, position = "stack",
           width = 1) +
  labs(title = "Species composition of Papillomaviridae reads") +
  scale_fill_manual(values = palette_viral, name = "Viral species") +
  scale_y_continuous(
    name = "% Papillomaviridae Reads",
    limits = c(0, 1.01),
    breaks = seq(0, 1, 0.2),
    expand = c(0, 0),
    labels = function(frac) frac * 100
  ) +
  guides(fill = guide_legend(ncol = 3)) +
  theme(plot.title = element_text(size = rel(1.4), hjust = 0, face = "plain"))

g_papilloma_species

# Get most prominent species for text
papilloma_species_collate <- papilloma_species_counts %>%
  group_by(name, taxid) %>%
  summarize(
    n_reads_tot = sum(n_reads_hv),
    p_reads_mean = mean(p_reads_papilloma),
    .groups = "drop"
  ) %>%
  arrange(desc(n_reads_tot))
face="plain")) + +g_parvo_species + +# Get most prominent species for text +parvo_species_collate <- parvo_species_counts %>% group_by(name, taxid) %>% + summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_parvo), .groups="drop") %>% + arrange(desc(n_reads_tot)) +``` + +```{r} +#| label: hv-species-circo +#| fig-height: 5 +#| fig-width: 7 + +threshold_major_species <- 0.1 +taxid_circo <- 39724 + +# Get set of circoviridae reads +circo_samples <- hv_family_counts %>% filter(taxid == taxid_circo) %>% + filter(p_reads_hv >= 0.1) %>% + pull(sample) +circo_ids <- hv_reads_family %>% + filter(taxid == taxid_circo, sample %in% circo_samples) %>% + pull(seq_id) + +# Count reads for each circoviridae species +circo_species_counts <- hv_reads_species %>% + filter(seq_id %in% circo_ids) %>% + group_by(sample, name, taxid) %>% + count(name = "n_reads_hv") %>% + group_by(sample) %>% + mutate(p_reads_circo = n_reads_hv/sum(n_reads_hv)) + +# Identify high-ranking families and group others +circo_species_major_tab <- circo_species_counts %>% group_by(name) %>% + filter(p_reads_circo == max(p_reads_circo)) %>% + filter(row_number() == 1) %>% + arrange(desc(p_reads_circo)) %>% + filter(p_reads_circo > threshold_major_species) +circo_species_counts_major <- circo_species_counts %>% + mutate(name_display = ifelse(name %in% circo_species_major_tab$name, + name, "Other")) %>% + group_by(sample, name_display) %>% + summarize(n_reads_circo = sum(n_reads_hv), + p_reads_circo = sum(p_reads_circo), + .groups="drop") %>% + mutate(name_display = factor(name_display, + levels = c(circo_species_major_tab$name, "Other"))) +circo_species_counts_display <- circo_species_counts_major %>% + rename(p_reads = p_reads_circo, classification = name_display) + +# Plot +g_circo_species <- g_comp_base + + geom_col(data=circo_species_counts_display, position = "stack", width=1) + + scale_y_continuous(name="% Circoviridae Reads", limits=c(0,1.01), + breaks = seq(0,1,0.2), + expand=c(0,0), 
labels = function(y) y*100) + + scale_fill_manual(values=palette_viral, name = "Viral species") + + labs(title="Species composition of Circoviridae reads") + + guides(fill=guide_legend(ncol=3)) + + theme(plot.title = element_text(size=rel(1.4), hjust=0, face="plain")) + +g_circo_species + +# Get most prominent species for text +circo_species_collate <- circo_species_counts %>% group_by(name, taxid) %>% + summarize(n_reads_tot = sum(n_reads_hv), p_reads_mean = mean(p_reads_circo), .groups="drop") %>% + arrange(desc(n_reads_tot)) +``` + +```{r} +#| label: hv-species-herpes +#| fig-height: 5 +#| fig-width: 7 + +threshold_major_species <- 0.1 +taxid_herpes <- 10292 + +# Get set of herpesviridae reads +herpes_samples <- hv_family_counts %>% filter(taxid == taxid_herpes) %>% + filter(p_reads_hv >= 0.1) %>% + pull(sample) +herpes_ids <- hv_reads_family %>% + filter(taxid == taxid_herpes, sample %in% herpes_samples) %>% + pull(seq_id) + +# Count reads for each herpesviridae species +herpes_species_counts <- hv_reads_species %>% + filter(seq_id %in% herpes_ids) %>% + group_by(sample, name, taxid) %>% + count(name = "n_reads_hv") %>% + group_by(sample) %>% + mutate(p_reads_herpes = n_reads_hv/sum(n_reads_hv)) + +# Identify high-ranking families and group others +herpes_species_major_tab <- herpes_species_counts %>% group_by(name) %>% + filter(p_reads_herpes == max(p_reads_herpes)) %>% + filter(row_number() == 1) %>% + arrange(desc(p_reads_herpes)) %>% + filter(p_reads_herpes > threshold_major_species) +herpes_species_counts_major <- herpes_species_counts %>% + mutate(name_display = ifelse(name %in% herpes_species_major_tab$name, + name, "Other")) %>% + group_by(sample, name_display) %>% + summarize(n_reads_herpes = sum(n_reads_hv), + p_reads_herpes = sum(p_reads_herpes), + .groups="drop") %>% + mutate(name_display = factor(name_display, + levels = c(herpes_species_major_tab$name, "Other"))) +herpes_species_counts_display <- herpes_species_counts_major %>% + rename(p_reads = 
#| fig-height: 5
#| label: ra-genera
#| warning: false

# Reference genera, split by genome type, annotated from viral_taxa by name
path_genera_rna <- c(
  "Mamastrovirus", "Enterovirus", "Salivirus", "Kobuvirus", "Norovirus",
  "Sapovirus", "Rotavirus", "Alphacoronavirus", "Betacoronavirus",
  "Alphainfluenzavirus", "Betainfluenzavirus", "Lentivirus"
)
path_genera_dna <- c(
  "Mastadenovirus", "Alphapolyomavirus", "Betapolyomavirus",
  "Alphapapillomavirus", "Betapapillomavirus", "Gammapapillomavirus",
  "Orthopoxvirus", "Simplexvirus", "Lymphocryptovirus", "Cytomegalovirus",
  "Dependoparvovirus"
)
path_genera <- tibble(
  name = c(path_genera_rna, path_genera_dna),
  genome_type = rep(
    c("RNA genome", "DNA genome"),
    times = c(length(path_genera_rna), length(path_genera_dna))
  )
) %>%
  left_join(viral_taxa, by = "name")

# Raise every read in mrg_hv to genus level, then count the reference genera
# in each sample relative to that sample's raw read count
mrg_hv_named_all <- left_join(mrg_hv, viral_taxa, by = "taxid")
hv_reads_genus_all <- raise_rank(mrg_hv_named_all, viral_taxa, "genus")
n_path_genera <- hv_reads_genus_all %>%
  group_by(sample, name, taxid) %>%
  summarize(n_reads_viral = n(), .groups = "keep") %>%
  inner_join(path_genera, by = c("name", "taxid")) %>%
  left_join(read_counts_raw, by = "sample") %>%
  mutate(p_reads_viral = n_reads_viral / n_reads_raw)

# Pivot out and back so that sample/genus pairs without reads get a zero row
n_path_genera_out <- n_path_genera %>%
  ungroup() %>%
  select(sample, name, n_reads_viral) %>%
  pivot_wider(names_from = "name", values_from = "n_reads_viral",
              values_fill = 0) %>%
  pivot_longer(cols = -sample, names_to = "name",
               values_to = "n_reads_viral") %>%
  left_join(read_counts_raw, by = "sample") %>%
  left_join(path_genera, by = "name") %>%
  mutate(p_reads_viral = n_reads_viral / n_reads_raw)

# Aggregate each genus over all samples
n_path_genera_stype <- n_path_genera_out %>%
  group_by(name, taxid, genome_type) %>%
  summarize(
    n_reads_raw = sum(n_reads_raw),
    n_reads_viral = sum(n_reads_viral),
    .groups = "drop"
  ) %>%
  mutate(
    sample = "All samples",
    location = "All locations",
    p_reads_viral = n_reads_viral / n_reads_raw,
    na_type = "DNA"
  )

# Plot
g_path_genera <- ggplot(n_path_genera_stype, aes(y = name, x = p_reads_viral)) +
  geom_point() +
  scale_x_log10(name = "Relative abundance") +
  facet_grid(genome_type ~ ., scales = "free_y") +
  theme_base +
  theme(axis.title.y = element_blank())
g_path_genera