diff --git a/Cargo.lock b/Cargo.lock index f4acf157..f1218059 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1548,9 +1548,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a73bae93170d8d0f816e18b6a630d76e134b90958850985ee2f0fb2f641d4de" +version = "0.16.0" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#62f03eb3de8f4b05307efad74f321ced04de40f1" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 5f8cc0b4..5e28367b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.3", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } +#sourmash = { version = "0.15.2", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "refactor_rs_downsample", features = ["branchwater"] } serde_json = "1.0.128" sourmash = { version = "0.15.2", features = ["branchwater"] } niffler = "2.4.0" diff --git a/src/manysearch.rs b/src/manysearch.rs index fcd691d3..a6c320a0 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -220,6 +220,7 @@ fn downsample_and_inflate_abundances( // avoid downsampling if we can if against_scaled != query_scaled { let against_ds = against + .clone() .downsample_scaled(query.scaled()) .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 7bc54b3c..643799b9 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -992,7 +992,7 @@ def test_indexed_full_output(runtmp): # check a few columns average_ani = set(df['average_containment_ani']) avg_ani = set([round(x, 4) for x in average_ani]) - assert avg_ani == {0.8502, 0.8584, 0.8602} + assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather f_unique_weighted = set(df['f_unique_weighted']) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) @@ -1000,7 +1000,7 @@ def test_indexed_full_output(runtmp): unique_intersect_bp = set(df['unique_intersect_bp']) unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) - assert unique_intersect_bp == {44000, 18000, 22000} + assert unique_intersect_bp == {4400000, 1800000, 2200000} def test_nonindexed_full_vs_sourmash_gather(runtmp): diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 093344e9..48157179 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -452,7 +452,10 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = coll.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.downsample_scaled(query.scaled()).unwrap(); + let against_mh_ds = against_mh + .clone() + .downsample_scaled(query.scaled()) + .unwrap(); if let Ok(overlap) = against_mh_ds.count_common(query, false) { if overlap >= threshold_hashes { let result = PrefetchResult {