This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new a42538291a7 [fix](inverted index)Support Chinese column name with inverted index (#36321) a42538291a7 is described below commit a42538291a7a2d2cd3281d5b1e324065c28660f2 Author: qiye <jianliang5...@gmail.com> AuthorDate: Sun Jun 16 10:02:14 2024 +0800 [fix](inverted index)Support Chinese column name with inverted index (#36321) 1. Converting `std::string` to `std::wstring` with the iterator-range constructor (`std::wstring(s.begin(), s.end())`) widens each byte separately, so it is only correct for ASCII characters. For non-ASCII column names such as Chinese, we need to use `StringUtil::string_to_wstring` instead. 2. Fix index_tool `check_terms_stats_v2` to take the index id from `--idx_id` instead of a hard-coded value, and print the field name alongside each term. Issue Number: #34118 --- be/src/index-tools/index_tool.cpp | 12 ++++-- .../rowset/segment_v2/inverted_index_reader.cpp | 6 +-- .../rowset/segment_v2/inverted_index_writer.cpp | 2 +- .../test_index_chinese_column.out | 7 ++++ .../test_index_chinese_column.groovy | 44 ++++++++++++++++++++++ 5 files changed, 64 insertions(+), 7 deletions(-) diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index 5f49bc268f4..d729cc3ff97 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -101,6 +101,9 @@ std::string get_usage(const std::string& progname) { "--trans_vec_file=path/to/file\n"; ss << "./index_tool --operation=write_index_v2 --idx_file_path=path/to/index " "--data_file_path=data/to/index\n"; + ss << "./index_tool --operation=show_nested_files_v2 --idx_file_path=path/to/file\n"; + ss << "./index_tool --operation=check_terms_stats_v2 --idx_file_path=path/to/file " + "--idx_id=index_id\n"; return ss.str(); } @@ -205,7 +208,10 @@ void check_terms_stats(lucene::store::Directory* dir) { /* empty */ std::string token = lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength()); + std::string field = lucene_wcstoutf8string(te->term(false)->field(), + lenOfString(te->term(false)->field())); + printf("Field: %s ", field.c_str()); printf("Term: %s ", token.c_str()); printf("Freq: 
%d\n", te->docFreq()); if (FLAGS_print_doc_id) { @@ -557,7 +563,7 @@ int main(int argc, char** argv) { auto field_config = (int32_t)(lucene::document::Field::STORE_NO); field_config |= (int32_t)(lucene::document::Field::INDEX_NONORMS); field_config |= lucene::document::Field::INDEX_TOKENIZED; - auto field_name = std::wstring(name.begin(), name.end()); + auto field_name = StringUtil::string_to_wstring(name); auto field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); field->setOmitTermFreqAndPositions(false); doc->add(*field); @@ -632,7 +638,7 @@ int main(int argc, char** argv) { std::cerr << "error occurred when show files: " << err.what() << std::endl; } } else if (FLAGS_operation == "check_terms_stats_v2") { - if (FLAGS_idx_file_path == "") { + if (FLAGS_idx_file_path == "" || FLAGS_idx_id <= 0) { std::cout << "no file flag for check " << std::endl; return -1; } @@ -647,7 +653,7 @@ int main(int argc, char** argv) { return -1; } std::vector<std::string> files; - int64_t index_id = 1; + int64_t index_id = FLAGS_idx_id; std::string index_suffix = ""; doris::TabletIndexPB index_pb; index_pb.set_index_id(index_id); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 35286088a57..3639bff05c4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -164,7 +164,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r bool drop_duplicates) { analyse_result.clear(); - std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); + std::wstring field_ws = StringUtil::string_to_wstring(field_name); std::unique_ptr<lucene::analysis::TokenStream> token_stream( analyzer->tokenStream(field_ws.c_str(), reader)); @@ -353,7 +353,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } std::unique_ptr<lucene::search::Query> query; - 
query_info.field_name = std::wstring(column_name.begin(), column_name.end()); + query_info.field_name = StringUtil::string_to_wstring(column_name); if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || @@ -464,7 +464,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, // std::string search_str = reinterpret_cast<const StringRef*>(query_value)->to_string(); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end()); + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); // unique_ptr with custom deleter std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index cc2c89bb116..9f51098a052 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -88,7 +88,7 @@ public: _parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta->properties())); _value_key_coder = get_key_coder(field_type); - _field_name = std::wstring(field_name.begin(), field_name.end()); + _field_name = StringUtil::string_to_wstring(field_name); } ~InvertedIndexColumnWriterImpl() override { diff --git a/regression-test/data/inverted_index_p0/test_index_chinese_column.out b/regression-test/data/inverted_index_p0/test_index_chinese_column.out new file mode 100644 index 00000000000..8b3ebab527e --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +1 json love anny json anny 2023-10-10T12:11:11 + +-- !sql -- +1 json love anny json anny 2023-10-10T12:11:11 + diff --git a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy new file mode 100644 index 00000000000..21a94e1ffef --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+suite("test_index_chinese_column", "inverted_index_select"){ + def createAndInsertData = { table_name, inverted_index_storage_format -> + sql "DROP TABLE IF EXISTS ${table_name}" + sql """ + CREATE TABLE ${table_name} + ( + k1 int , + 名称 string, + k3 char(50), + k4 varchar(200), + k5 datetime, + index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257") + ) + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES("replication_num" = "1","inverted_index_storage_format" = "${inverted_index_storage_format}") + """ + sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') " + qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'" + } + + def table_name_v1 = "test_index_chinese_column_v1" + def table_name_v2 = "test_index_chinese_column_v2" + + sql "set enable_unicode_name_support=true" + + createAndInsertData(table_name_v1, "V1") + createAndInsertData(table_name_v2, "V2") +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org