From 69fd38bed0db859786a4f55c6b16f14a46f8746c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 09:57:03 -0400 Subject: [PATCH 1/9] update to next sourmash release --- Cargo.lock | 5 ++--- Cargo.toml | 3 ++- src/manysearch.rs | 1 + src/utils.rs | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4acf157..968f5cea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1548,9 +1548,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a73bae93170d8d0f816e18b6a630d76e134b90958850985ee2f0fb2f641d4de" +version = "0.16.0" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#9b9fc5a4d40521e14390766fb6ffde4c6921062c" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 7935aa38..7dbc99cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.3", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } -sourmash = { version = "0.15.2", features = ["branchwater"] } +#sourmash = { version = "0.15.2", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "refactor_rs_downsample", features = ["branchwater"] } serde_json = "1.0.128" niffler = "2.4.0" log = "0.4.22" diff --git a/src/manysearch.rs b/src/manysearch.rs index d343493d..199af8b5 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -219,6 +219,7 @@ fn downsample_and_inflate_abundances( // avoid downsampling if we can if against_scaled != query_scaled { let against_ds = against + .clone() .downsample_scaled(query.scaled()) .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; diff --git a/src/utils.rs b/src/utils.rs index 33f78316..f0a81d5f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -478,7 +478,7 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.downsample_scaled(query.scaled()).unwrap(); + let against_mh_ds = against_mh.clone().downsample_scaled(query.scaled()).unwrap(); if let Ok(overlap) = against_mh_ds.count_common(query, false) { if overlap >= threshold_hashes { let result = PrefetchResult { From ee580b683ae9f5564b5c48c8e70cda4586c8fbe9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 10:04:16 -0400 Subject: [PATCH 2/9] cargo fmt --- src/utils.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index f0a81d5f..a702378f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -478,7 +478,10 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.clone().downsample_scaled(query.scaled()).unwrap(); + let against_mh_ds = against_mh + .clone() + .downsample_scaled(query.scaled()) + .unwrap(); if let Ok(overlap) = against_mh_ds.count_common(query, false) { if overlap >= threshold_hashes { let result = PrefetchResult { From 981405166f6e4c341242357db28c1b58beddbe5e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:53:16 -0400 Subject: [PATCH 3/9] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 968f5cea..7e2264f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#9b9fc5a4d40521e14390766fb6ffde4c6921062c" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#79afb857967d5f48393341a77e43ea27ab3caf22" dependencies = [ "az", "byteorder", From d27b03e8e677a6b96d866ea2c89208dcc7135aa4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 16:38:32 -0400 Subject: [PATCH 4/9] correct numbers --- Cargo.lock | 2 +- src/python/tests/test_fastmultigather.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e2264f7..7b1eb621 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#79afb857967d5f48393341a77e43ea27ab3caf22" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ddcb049e99749f1d16c414e0fdb2d06d55a38db7" dependencies = [ "az", "byteorder", diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index f485b7f0..653b0ba5 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -959,7 +959,7 @@ def test_indexed_full_output(runtmp): # check a few columns average_ani = set(df['average_containment_ani']) avg_ani = set([round(x, 4) for x in average_ani]) - assert avg_ani == {0.8502, 0.8584, 0.8602} + assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather f_unique_weighted = set(df['f_unique_weighted']) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) @@ -967,7 +967,7 @@ def test_indexed_full_output(runtmp): unique_intersect_bp = set(df['unique_intersect_bp']) unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) - assert unique_intersect_bp == {44000, 18000, 22000} + assert unique_intersect_bp == {4400000, 1800000, 2200000} def test_nonindexed_full_vs_sourmash_gather(runtmp): From e35111a7d2a6a5c510a23b04f8bb091a4ca30e76 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:08:38 -0400 Subject: [PATCH 5/9] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 7b1eb621..da608df5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ddcb049e99749f1d16c414e0fdb2d06d55a38db7" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#6eb86a390c53fc243bf65dd38fab3d1712c9f579" dependencies = [ "az", "byteorder", From 4778862e9def28528a7e9cc65d8d75c6ec2dc9f0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:24:42 -0400 Subject: [PATCH 6/9] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index da608df5..f1218059 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#6eb86a390c53fc243bf65dd38fab3d1712c9f579" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#62f03eb3de8f4b05307efad74f321ced04de40f1" dependencies = [ "az", "byteorder", From a0e02efb86ad084e1fcd18585a5f71253ca105ad Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:19:58 -0400 Subject: [PATCH 7/9] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f1218059..a56788cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#62f03eb3de8f4b05307efad74f321ced04de40f1" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#e4e5555fd81a9a8677bbe065cf7f528270b01fed" dependencies = [ "az", "byteorder", From 9b448c8a873e9fde1bc4cb84f441863078677eb9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:29:52 -0400 Subject: [PATCH 8/9] use new try_into() and eliminate several clone()s --- src/utils.rs | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index a702378f..0b7df6c9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -476,28 +476,30 @@ pub fn load_sketches_above_threshold( let mut results = Vec::new(); // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { - if let Some(against_mh) = against_sig.minhash() { - // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh - .clone() - .downsample_scaled(query.scaled()) - .unwrap(); - if let Ok(overlap) = against_mh_ds.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_record.name().to_string(), - md5sum: against_mh.md5sum(), - minhash: against_mh_ds.clone(), - location: against_record.internal_location().to_string(), - overlap, - }; - results.push(result); - } + let against_filename = against_sig.filename(); + let against_mh: KmerMinHash = against_sig.try_into().expect("cannot get sketch"); + let against_md5 = against_mh.md5sum(); // keep original md5sum + + let against_mh_ds = against_mh + .downsample_scaled(query.scaled()) + .expect("cannot downsample sketch"); + + // good? ok, store as candidate from prefetch. + if let Ok(overlap) = against_mh_ds.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_md5, + minhash: against_mh_ds, + location: against_record.internal_location().to_string(), + overlap, + }; + results.push(result); } } else { eprintln!( "WARNING: no compatible sketches in path '{}'", - against_sig.filename() + against_filename ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } From 18a363e4adb120fbe43b77ec41fc5158c414107b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 10:45:13 -0400 Subject: [PATCH 9/9] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a56788cf..14c16762 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#e4e5555fd81a9a8677bbe065cf7f528270b01fed" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ceaea393d95b3b85575b51c20784d3b9442da149" dependencies = [ "az", "byteorder",