From 1fb6dca37912623f320595a9cfa1ed3e1a23753d Mon Sep 17 00:00:00 2001 From: qiye Date: Mon, 17 Jun 2024 19:42:18 +0800 Subject: [PATCH] [fix](inverted index)Support Chinese column name with inverted index #36321 (#36375) --- be/src/index-tools/index_tool.cpp | 4 +- .../segment_v2/inverted_index_reader.cpp | 6 +-- .../segment_v2/inverted_index_writer.cpp | 2 +- .../test_index_chinese_column.out | 3 ++ .../test_index_chinese_column.groovy | 42 +++++++++++++++++++ 5 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_index_chinese_column.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index 9892d9d5bcbfea..53f7aa454c6f26 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -89,8 +89,8 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke IndexSearcher s(reader); std::unique_ptr query; - std::wstring field_ws(field.begin(), field.end()); - std::wstring token_ws(token.begin(), token.end()); + auto field_ws = StringUtil::string_to_wstring(field); + auto token_ws = StringUtil::string_to_wstring(token); lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); if (pred == "eq" || pred == "match") { query.reset(new lucene::search::TermQuery(term)); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 5780f04ade9029..79ab97ee50ae28 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -155,7 +155,7 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r bool drop_duplicates) { analyse_result.clear(); - std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); + std::wstring field_ws = StringUtil::string_to_wstring(field_name); std::unique_ptr token_stream( analyzer->tokenStream(field_ws.c_str(), reader)); @@ -316,7 +316,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run IndexSearcherPtr index_searcher = nullptr; std::unique_ptr query; - std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); + std::wstring field_ws = StringUtil::string_to_wstring(column_name); roaring::Roaring query_match_bitmap; bool null_bitmap_already_read = false; @@ -635,7 +635,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, // std::string search_str = reinterpret_cast(query_value)->to_string(); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end()); + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); // unique_ptr with custom deleter std::unique_ptr term { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index f2c891fefcab84..7b9628c7c0479d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -83,7 +83,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { _parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())); _value_key_coder = get_key_coder(field_type); - _field_name = std::wstring(field_name.begin(), field_name.end()); + _field_name = StringUtil::string_to_wstring(field_name); } ~InvertedIndexColumnWriterImpl() override { diff --git a/regression-test/data/inverted_index_p0/test_index_chinese_column.out b/regression-test/data/inverted_index_p0/test_index_chinese_column.out new file mode 100644 index 00000000000000..541d416885c655 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out @@ -0,0 +1,3 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 json love anny json anny 2023-10-10T12:11:11 diff --git a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy new file mode 100644 index 00000000000000..880077585d2e54 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +suite("test_index_chinese_column", "inverted_index_select"){ + def createAndInsertData = { table_name -> + sql "DROP TABLE IF EXISTS ${table_name}" + sql """ + CREATE TABLE ${table_name} + ( + k1 int , + 名称 string, + k3 char(50), + k4 varchar(200), + k5 datetime, + index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257") + ) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES("replication_num" = "1") + """ + sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') " + qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'" + } + + def table_name = "test_index_chinese_column" + + sql "set enable_unicode_name_support=true" + + createAndInsertData(table_name) +}