Skip to content

Commit

Permalink
[fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in…
Browse files Browse the repository at this point in the history
… Query Processing (#46310)

Problem Summary:
1. different values of inverted_index_max_expansions require separate
caches.
  • Loading branch information
zzzxl1993 authored Jan 6, 2025
1 parent 624bbff commit af63014
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 25 deletions.
46 changes: 21 additions & 25 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ Status InvertedIndexReader::match_index_search(
InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& query_info,
const FulltextIndexSearcherPtr& index_searcher,
const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
TQueryOptions queryOptions = runtime_state->query_options();
const auto& queryOptions = runtime_state->query_options();
try {
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
auto query = QueryFactory::create(query_type, index_searcher, queryOptions, io_ctx);
Expand Down Expand Up @@ -294,24 +294,23 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist
VLOG_DEBUG << column_name << " begin to search the fulltext index from clucene, query_str ["
<< search_str << "]";

const auto& queryOptions = runtime_state->query_options();
try {
InvertedIndexQueryInfo query_info;
InvertedIndexQueryCache::CacheKey cache_key;
auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta);

// terms
if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
cache_key = {index_file_key, column_name, query_type, search_str};
query_info.terms.emplace_back(search_str);
} else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
PhraseQuery::parser_info(search_str, column_name, query_type, _index_meta.properties(),
query_info, queryOptions.enable_phrase_query_sequential_opt);
} else {
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
PhraseQuery::parser_info(
search_str, column_name, query_type, _index_meta.properties(), query_info,
runtime_state->query_options().enable_phrase_query_sequential_opt);
} else {
query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
search_str, column_name, query_type, _index_meta.properties());
}
query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
search_str, column_name, query_type, _index_meta.properties());
}

if (query_info.terms.empty()) {
auto msg = fmt::format(
"token parser result is empty for query, "
Expand All @@ -325,22 +324,20 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist
}
}

std::unique_ptr<lucene::search::Query> query;
// field_name
query_info.field_name = StringUtil::string_to_wstring(column_name);

if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
query_type == InvertedIndexQueryType::EQUAL_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) {
std::string str_tokens = join(query_info.terms, " ");
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
str_tokens += " " + std::to_string(query_info.slop);
str_tokens += " " + std::to_string(query_info.ordered);
}
cache_key = {index_file_key, column_name, query_type, str_tokens};
// cache_key
std::string str_tokens = join(query_info.terms, " ");
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
str_tokens += " " + std::to_string(query_info.slop);
str_tokens += " " + std::to_string(query_info.ordered);
} else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
str_tokens += " " + std::to_string(queryOptions.inverted_index_max_expansions);
}
cache_key = {index_file_key, column_name, query_type, std::move(str_tokens)};

auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;

Expand All @@ -350,13 +347,12 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist
if (cache_status.ok()) {
return Status::OK();
}
FulltextIndexSearcherPtr* searcher_ptr = nullptr;

InvertedIndexCacheHandle inverted_index_cache_handle;
RETURN_IF_ERROR(
handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats));
auto searcher_variant = inverted_index_cache_handle.get_index_searcher();
searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
auto* searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
if (searcher_ptr != nullptr) {
term_match_bitmap = std::make_shared<roaring::Roaring>();
RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, query_info,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,9 @@
-- !sql --
6

-- !sql --
0

-- !sql --
1

Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,9 @@
-- !sql --
0

-- !sql --
4

-- !sql --
377

Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
sql """ INSERT INTO ${indexTbName1} VALUES (6, "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S 123456789"); """
sql """ INSERT INTO ${indexTbName1} VALUES (7, "O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789"); """

sql """ INSERT INTO ${indexTbName1} VALUES (1, "", "s1", ""), (2, "", "s2", ""), (3, "", "s3", ""), (4, "", "s4", ""), (5, "", "tv s5", ""); """

try {
sql "sync"
sql """ set enable_common_expr_pushdown = true; """
Expand All @@ -58,7 +60,14 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """
qt_sql """ select count() from ${indexTbName1} where d match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """

sql """ set inverted_index_max_expansions = 3; """
qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'tv s'; """

sql """ set inverted_index_max_expansions = 5; """
qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'tv s'; """

} finally {
sql """ set inverted_index_max_expansions = 50; """
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,14 @@ suite("test_index_match_regexp", "nonConcurrent"){
qt_sql """ select count() from test_index_match_regexp where request match_regexp '.*tickets.*'; """
qt_sql """ select count() from test_index_match_regexp where request match_regexp 'nonexistence'; """

sql """ set inverted_index_max_expansions = 1; """
qt_sql """ select count() from test_index_match_regexp where request match_regexp 'b'; """

sql """ set inverted_index_max_expansions = 50; """
qt_sql """ select count() from test_index_match_regexp where request match_regexp 'b'; """

} finally {
sql """ set inverted_index_max_expansions = 50; """
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
}
}

0 comments on commit af63014

Please sign in to comment.