From 50548656b8070ebdc7a118aecf51aa6701f4bc25 Mon Sep 17 00:00:00 2001 From: zzzxl Date: Mon, 25 Nov 2024 17:32:42 +0800 Subject: [PATCH] [fix](inverted index) Content Check for Tokenize Function Parser (#44465) Problem Summary: 1. Prevent users from mistakenly assuming other tokenizers exist. --- be/src/vec/functions/function_tokenize.cpp | 6 ++++++ .../suites/inverted_index_p0/test_tokenize.groovy | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index e7dc2debe62ad8..b1ec177d444adb 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -140,6 +140,12 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block } inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(properties)); + if (inverted_index_ctx.parser_type == InvertedIndexParserType::PARSER_UNKNOWN) { + return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( + "unsupported parser type. currently, only 'english', 'chinese', and " + "'unicode' analyzers are supported."); + } + inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); inverted_index_ctx.char_filter_map = get_parser_char_filter_map_from_properties(properties); diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 8d7e2dac42ef53..4672a39cedbdce 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+import java.sql.SQLException suite("test_tokenize"){ // prepare test table @@ -98,4 +99,14 @@ suite("test_tokenize"){ qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');""" qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode","stopwords" = "none"');""" + + try { + sql """ SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="eng"'); """ + } catch (SQLException e) { + if (e.message.contains("E-6000")) { + log.info("e message: {}", e.message) + } else { + throw e + } + } }