From 63eb0566c920170fcc1d03845addaaecbe3a3cfb Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Sat, 30 Nov 2024 16:05:12 +0100 Subject: [PATCH 1/2] Remove legacy wand term freq scorer. --- .../profiled_iterator_test.cpp | 2 +- .../sparse_vector_benchmark_test.cpp | 2 +- .../src/tests/queryeval/weak_and/rise_wand.h | 10 +++---- .../tests/queryeval/weak_and/rise_wand.hpp | 6 ++--- .../queryeval/weak_and/wand_bench_setup.hpp | 27 ++++++++++++------- .../queryeval/weak_and/weak_and_bench.cpp | 26 +++++++++--------- .../queryeval/weak_and/weak_and_test.cpp | 11 +++++--- .../weak_and/weak_and_test_expensive.cpp | 8 +++--- .../searchlib/queryeval/wand/wand_parts.h | 21 --------------- .../queryeval/wand/weak_and_search.cpp | 7 +++-- 10 files changed, 55 insertions(+), 65 deletions(-) diff --git a/searchlib/src/tests/queryeval/profiled_iterator/profiled_iterator_test.cpp b/searchlib/src/tests/queryeval/profiled_iterator/profiled_iterator_test.cpp index b4ceacb71ca1..5f5f0d0b5f71 100644 --- a/searchlib/src/tests/queryeval/profiled_iterator/profiled_iterator_test.cpp +++ b/searchlib/src/tests/queryeval/profiled_iterator/profiled_iterator_test.cpp @@ -96,7 +96,7 @@ SearchIterator::UP create_weak_and() { terms.emplace_back(T({1,2,3}).release(), 100, 3); terms.emplace_back(T({5,6}).release(), 200, 2); terms.emplace_back(T({8}).release(), 300, 1); - return WeakAndSearch::create(terms, wand::MatchParams(dummy_heap), wand::TermFrequencyScorer(), 100, true, true); + return WeakAndSearch::create(terms, wand::MatchParams(dummy_heap), wand::Bm25TermFrequencyScorer(num_docs), 100, true, true); } void collect(std::map &counts, const auto &node) { diff --git a/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp b/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp index 00224e4eda02..d1cce8d43440 100644 --- a/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp @@ -219,7 +219,7 @@ struct RiseWandFactory : SparseVectorFactory { for (size_t i = 0; i < childCnt; ++i) { terms.emplace_back(childFactory.createChild(i, limit), default_weight, limit / (i + 1)); } - return std::make_unique(terms, n); + return std::make_unique(terms, n, rise::TermFreqScorer(limit)); } }; diff --git a/searchlib/src/tests/queryeval/weak_and/rise_wand.h b/searchlib/src/tests/queryeval/weak_and/rise_wand.h index 4c7be54a6f06..d100a206c028 100644 --- a/searchlib/src/tests/queryeval/weak_and/rise_wand.h +++ b/searchlib/src/tests/queryeval/weak_and/rise_wand.h @@ -7,17 +7,17 @@ #include #include +using search::queryeval::wand::Bm25TermFrequencyScorer; using search::queryeval::wand::DotProductScorer; -using search::queryeval::wand::TermFrequencyScorer; using namespace search::queryeval; namespace rise { struct TermFreqScorer { - [[no_unique_address]] TermFrequencyScorer _termFrequencyScorer; - TermFreqScorer() noexcept - : _termFrequencyScorer() + [[no_unique_address]] Bm25TermFrequencyScorer _termFrequencyScorer; + TermFreqScorer(uint32_t num_docs) noexcept + : _termFrequencyScorer(num_docs) { } int64_t calculateMaxScore(const wand::Term &term) const noexcept { return _termFrequencyScorer.calculateMaxScore(term); @@ -127,7 +127,7 @@ class RiseWand : public search::queryeval::SearchIterator void _sortMerge(uint32_t numStreamsToSort); public: - RiseWand(const Terms &terms, uint32_t n); + RiseWand(const Terms &terms, uint32_t n, Scorer scorer); ~RiseWand() override; void next(); void doSeek(uint32_t docid) override; diff --git a/searchlib/src/tests/queryeval/weak_and/rise_wand.hpp b/searchlib/src/tests/queryeval/weak_and/rise_wand.hpp index c477be5cc62f..4e3de74e1e91 100644 --- a/searchlib/src/tests/queryeval/weak_and/rise_wand.hpp +++ b/searchlib/src/tests/queryeval/weak_and/rise_wand.hpp @@ -6,12 +6,10 @@ #include #include -using search::queryeval::wand::TermFrequencyScorer; - namespace rise { template -RiseWand::RiseWand(const Terms &terms, uint32_t n) +RiseWand::RiseWand(const Terms &terms, uint32_t n, Scorer scorer) : _numStreams(0), _streams(), _lastPivotIdx(0), @@ -19,7 +17,7 @@ RiseWand::RiseWand(const Terms &terms, uint32_t n) _streamIndices(new uint16_t[terms.size()]), _streamIndicesAux(new uint16_t[terms.size()]), _streamComparator(_streamDocIds), - _scorer(), + _scorer(std::move(scorer)), _n(n), _limit(1), _streamScores(new score_t[terms.size()]), diff --git a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp index 128f95a634d6..1f7743cda8bf 100644 --- a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp +++ b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp @@ -119,14 +119,16 @@ VespaWandFactory::~VespaWandFactory() = default; struct VespaArrayWandFactory : WandFactory { mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - explicit VespaArrayWandFactory(uint32_t n_in) + uint32_t docid_limit; + explicit VespaArrayWandFactory(uint32_t n_in, uint32_t docid_limit_in) : _scores(n_in), - n(n_in) + n(n_in), + docid_limit(docid_limit_in) {} ~VespaArrayWandFactory() override; std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::TermFrequencyScorer(), n, true, false); + return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::Bm25TermFrequencyScorer(docid_limit), n, true, false); } }; @@ -135,14 +137,16 @@ VespaArrayWandFactory::~VespaArrayWandFactory() = default; struct VespaHeapWandFactory : WandFactory { mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - explicit VespaHeapWandFactory(uint32_t n_in) + uint32_t docid_limit; + explicit VespaHeapWandFactory(uint32_t n_in, uint32_t docid_limit_in) : _scores(n_in), - n(n_in) + n(n_in), + docid_limit(docid_limit_in) {} ~VespaHeapWandFactory() override; std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::TermFrequencyScorer(), n, true, false); + return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, wand::StopWordStrategy::none(), 1, 0), wand::Bm25TermFrequencyScorer(docid_limit), n, true, false); } }; @@ -191,11 +195,16 @@ VespaParallelHeapWandFactory::~VespaParallelHeapWandFactory() = default; struct TermFrequencyRiseWandFactory : WandFactory { uint32_t n; - explicit TermFrequencyRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {} + uint32_t docid_limit; + explicit TermFrequencyRiseWandFactory(uint32_t n_in, uint32_t docid_limit_in) noexcept + : n(n_in), + docid_limit(docid_limit_in) + { + } ~TermFrequencyRiseWandFactory() override; std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return std::make_unique(terms, n); + return std::make_unique(terms, n, rise::TermFreqScorer(docid_limit)); } }; @@ -207,7 +216,7 @@ struct DotProductRiseWandFactory : WandFactory { ~DotProductRiseWandFactory() override; std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); } SearchIterator::UP create(const wand::Terms &terms) override { - return std::make_unique(terms, n); + return std::make_unique(terms, n, DotProductScorer()); } }; diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_bench.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_bench.cpp index e536e3a098a8..046c95bcc577 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_bench.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_bench.cpp @@ -1,18 +1,20 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "wand_bench_setup.hpp" -TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 10, 10000000)) { f2.benchmark(); } -TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 10, 10000000)) { f2.benchmark(); } -TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 100, 10000000)) { f2.benchmark(); } -TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 100, 10000000)) { f2.benchmark(); } -TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 1000, 10000000)) { f2.benchmark(); } -TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000), WandSetup(f1, 1000, 10000000)) { f2.benchmark(); } +constexpr uint32_t docid_limit = 10000000; -TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, 10000000)) { f3.benchmark(); } -TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, 10000000)) { f3.benchmark(); } -TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, 10000000)) { f3.benchmark(); } -TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, 10000000)) { f3.benchmark(); } -TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, 10000000)) { f3.benchmark(); } -TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, 10000000)) { f3.benchmark(); } +TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 10, docid_limit)) { f2.benchmark(); } +TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 10, docid_limit)) { f2.benchmark(); } +TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 100, docid_limit)) { f2.benchmark(); } +TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 100, docid_limit)) { f2.benchmark(); } +TEST_FF("benchmark", VespaWandFactory(1000), WandSetup(f1, 1000, docid_limit)) { f2.benchmark(); } +TEST_FF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), WandSetup(f1, 1000, docid_limit)) { f2.benchmark(); } + +TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 10, docid_limit)) { f3.benchmark(); } +TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 10, docid_limit)) { f3.benchmark(); } +TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 100, docid_limit)) { f3.benchmark(); } +TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 100, docid_limit)) { f3.benchmark(); } +TEST_FFF("benchmark", VespaWandFactory(1000), FilterFactory(f1, 2), WandSetup(f2, 1000, docid_limit)) { f3.benchmark(); } +TEST_FFF("benchmark", TermFrequencyRiseWandFactory(1000, docid_limit), FilterFactory(f1, 2), WandSetup(f2, 1000, docid_limit)) { f3.benchmark(); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp index fe47319e3a70..d43900518a37 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp @@ -18,6 +18,8 @@ using History = SearchHistory; namespace { +constexpr uint32_t docid_limit = 116; + struct MyWandSpec : public WandSpec { SharedWeakAndPriorityQueue scores; @@ -30,7 +32,7 @@ struct MyWandSpec : public WandSpec scores(n_in), n(n_in), matching_phase(MatchingPhase::FIRST_PHASE), - my_params(scores, wand::StopWordStrategy::none(), 1, 0) + my_params(scores, wand::StopWordStrategy::none(), 1, docid_limit) {} SearchIterator *create() { bool readonly_scores_heap = (matching_phase != MatchingPhase::FIRST_PHASE); @@ -39,7 +41,7 @@ struct MyWandSpec : public WandSpec } void set_second_phase() { matching_phase = MatchingPhase::SECOND_PHASE; } void set_abs_stop_word_adjust_limit(double limit) { - my_params.stop_words = wand::StopWordStrategy(-limit, 1.0, 0); + my_params.stop_words = wand::StopWordStrategy(-limit, 1.0, docid_limit); } SimpleResult search() { SearchIterator::UP search(create()); @@ -142,7 +144,8 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco terms.push_back(wand::Term(new TrackedSearch("foo", history, new EagerChild(search::endDocId)), 100, 1)); terms.push_back(wand::Term(new TrackedSearch("bar", history, new EagerChild(10)), 100, 2)); SharedWeakAndPriorityQueue scores(2); - auto search = std::make_unique("WAND", history, WeakAndSearch::create(terms, wand::MatchParams(scores), 2, true, false)); + wand::MatchParams match_params(scores, wand::StopWordStrategy::none(), wand::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY, docid_limit); + auto search = std::make_unique("WAND", history, WeakAndSearch::create(terms, match_params, 2, true, false)); SimpleResult hits; hits.search(*search); EXPECT_EQ(SimpleResult().addHit(10), hits); @@ -191,7 +194,7 @@ class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier { } static constexpr size_t LARGE_ENOUGH_HEAP_FOR_ALL = 10000; _scores.push_back(std::make_unique(LARGE_ENOUGH_HEAP_FOR_ALL)); - return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), wand::StopWordStrategy::none(), 1, 0), -1, strict, false); + return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), wand::StopWordStrategy::none(), 1, docid_limit), -1, strict, false); } }; diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp index 0573404a3b4e..14b5babb20d5 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp @@ -51,25 +51,25 @@ TEST("require that mod search works") { //---- WeakAndSearch ------------------------------------------------------------------------------ TEST_FF("require that (array) WAND and RISE WAND gives the same hits", - VespaArrayWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN)) + VespaArrayWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT)) { checkWandHits(f1, f2, 1, 0); } TEST_FF("require that (heap) WAND and RISE WAND gives the same hits", - VespaHeapWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN)) + VespaHeapWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT)) { checkWandHits(f1, f2, 1, 0); } TEST_FF("require that (array) WAND and RISE WAND gives the same hits with filtering and skipping", - VespaArrayWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN)) + VespaArrayWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT)) { checkWandHits(f1, f2, 123, 5); } TEST_FF("require that (heap) WAND and RISE WAND gives the same hits with filtering and skipping", - VespaHeapWandFactory(NUM_CHILDREN), TermFrequencyRiseWandFactory(NUM_CHILDREN)) + VespaHeapWandFactory(NUM_CHILDREN, LIMIT), TermFrequencyRiseWandFactory(NUM_CHILDREN, LIMIT)) { checkWandHits(f1, f2, 123, 5); } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index f48b9578ee1a..64b850d45834 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -455,27 +455,6 @@ DualHeap::stringify() const { constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0; -/** - * Scorer used with WeakAndAlgorithm that calculates a pseudo term frequency - * as max score and regular score for a term. - */ -struct TermFrequencyScorer -{ - // weight * idf, scaled to fixedpoint - score_t calculateMaxScore(double estHits, double weight) const noexcept { - return (score_t) (TermFrequencyScorer_TERM_SCORE_FACTOR * weight / (1.0 + log(1.0 + (estHits / 1000.0)))); - } - - score_t calculateMaxScore(const Term &term) const noexcept { - return calculateMaxScore(term.estHits, term.weight) + 1; - } - - template - score_t calculate_max_score(const Input &input, ref_t ref) const noexcept { - return calculateMaxScore(input.get_est_hits(ref), input.get_weight(ref)) + 1; - } -}; - class Bm25TermFrequencyScorer { public: diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index fe349c2338ad..e068d32dc155 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -173,14 +173,13 @@ WeakAndSearch::create(const Terms &terms, const MatchParams & params, const Scor SearchIterator::UP WeakAndSearch::create(const Terms &terms, const MatchParams & params, uint32_t n, bool strict, bool readonly_scores_heap) { - return create(terms, params, wand::TermFrequencyScorer(), n, strict, readonly_scores_heap); + return create(terms, params, wand::Bm25TermFrequencyScorer(params.docid_limit), n, strict, readonly_scores_heap); } //----------------------------------------------------------------------------- -template SearchIterator::UP WeakAndSearch::create(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); template SearchIterator::UP WeakAndSearch::create(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); -template SearchIterator::UP WeakAndSearch::createArrayWand(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); -template SearchIterator::UP WeakAndSearch::createHeapWand(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); +template SearchIterator::UP WeakAndSearch::createArrayWand(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); +template SearchIterator::UP WeakAndSearch::createHeapWand(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict, bool readonly_scores_heap); } From 3751c880025de8965be5a0c5ec74ace381ce5ae0 Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Sat, 30 Nov 2024 19:41:19 +0100 Subject: [PATCH 2/2] Remove no longer needed annotation. --- searchlib/src/tests/queryeval/weak_and/rise_wand.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/searchlib/src/tests/queryeval/weak_and/rise_wand.h b/searchlib/src/tests/queryeval/weak_and/rise_wand.h index d100a206c028..fba95f9bb015 100644 --- a/searchlib/src/tests/queryeval/weak_and/rise_wand.h +++ b/searchlib/src/tests/queryeval/weak_and/rise_wand.h @@ -15,7 +15,7 @@ namespace rise { struct TermFreqScorer { - [[no_unique_address]] Bm25TermFrequencyScorer _termFrequencyScorer; + Bm25TermFrequencyScorer _termFrequencyScorer; TermFreqScorer(uint32_t num_docs) noexcept : _termFrequencyScorer(num_docs) { }