Skip to content

Commit

Permalink
[fix](inverted index)Support Chinese column name with inverted index #…
Browse files Browse the repository at this point in the history
  • Loading branch information
qidaye authored Jun 17, 2024
1 parent dabad7c commit 1fb6dca
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 6 deletions.
4 changes: 2 additions & 2 deletions be/src/index-tools/index_tool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke
IndexSearcher s(reader);
std::unique_ptr<lucene::search::Query> query;

std::wstring field_ws(field.begin(), field.end());
std::wstring token_ws(token.begin(), token.end());
auto field_ws = StringUtil::string_to_wstring(field);
auto token_ws = StringUtil::string_to_wstring(token);
lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str());
if (pred == "eq" || pred == "match") {
query.reset(new lucene::search::TermQuery(term));
Expand Down
6 changes: 3 additions & 3 deletions be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r
bool drop_duplicates) {
analyse_result.clear();

std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
std::wstring field_ws = StringUtil::string_to_wstring(field_name);
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
analyzer->tokenStream(field_ws.c_str(), reader));

Expand Down Expand Up @@ -316,7 +316,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run

IndexSearcherPtr index_searcher = nullptr;
std::unique_ptr<lucene::search::Query> query;
std::wstring field_ws = std::wstring(column_name.begin(), column_name.end());
std::wstring field_ws = StringUtil::string_to_wstring(column_name);

roaring::Roaring query_match_bitmap;
bool null_bitmap_already_read = false;
Expand Down Expand Up @@ -635,7 +635,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
// std::string search_str = reinterpret_cast<const StringRef*>(query_value)->to_string();
VLOG_DEBUG << "begin to query the inverted index from clucene"
<< ", column_name: " << column_name << ", search_str: " << search_str;
std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
// unique_ptr with custom deleter
std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
Expand Down
2 changes: 1 addition & 1 deletion be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
_parser_type = get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(_index_meta->properties()));
_value_key_coder = get_key_coder(field_type);
_field_name = std::wstring(field_name.begin(), field_name.end());
_field_name = StringUtil::string_to_wstring(field_name);
}

~InvertedIndexColumnWriterImpl() override {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 json love anny json anny 2023-10-10T12:11:11
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression suite: a table whose column name is non-ASCII (`名称`) carries an
// inverted index on that column, and a MATCH_ALL query against it is executed.
// NOTE(review): the second suite() argument is presumably the suite's group tag — confirm.
suite("test_index_chinese_column", "inverted_index_select"){
// Recreates `table_name` from scratch, loads a single row, and runs the
// MATCH_ALL query on the Chinese-named column.
def createAndInsertData = { table_name ->
// Drop any leftover table so the suite can be re-run.
sql "DROP TABLE IF EXISTS ${table_name}"
// The indexed column is `名称`; the inverted index uses the "english" parser.
sql """
CREATE TABLE ${table_name}
(
k1 int ,
名称 string,
k3 char(50),
k4 varchar(200),
k5 datetime,
index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257")
)
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES("replication_num" = "1")
"""
// Single row; the `名称` value contains the token 'json' that the query below searches for.
sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') "
// NOTE(review): qt_sql results are presumably compared with the recorded
// `!sql` block of the suite's expected-output file — confirm.
qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'"
}

def table_name = "test_index_chinese_column"

// Issued before the closure is invoked, so the setting is already in effect
// when the table with the non-ASCII column name is created.
// NOTE(review): presumably this variable is what permits Unicode identifiers — confirm.
sql "set enable_unicode_name_support=true"

createAndInsertData(table_name)
}

0 comments on commit 1fb6dca

Please sign in to comment.