This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 1fb6dca3791 [fix](inverted index)Support Chinese column name with inverted index #36321 (#36375)
1fb6dca3791 is described below

commit 1fb6dca37912623f320595a9cfa1ed3e1a23753d
Author: qiye <jianliang5...@gmail.com>
AuthorDate: Mon Jun 17 19:42:18 2024 +0800

    [fix](inverted index)Support Chinese column name with inverted index #36321 (#36375)
---
 be/src/index-tools/index_tool.cpp                  |  4 +--
 .../rowset/segment_v2/inverted_index_reader.cpp    |  6 ++--
 .../rowset/segment_v2/inverted_index_writer.cpp    |  2 +-
 .../test_index_chinese_column.out                  |  3 ++
 .../test_index_chinese_column.groovy               | 42 ++++++++++++++++++++++
 5 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp
index 9892d9d5bcb..53f7aa454c6 100644
--- a/be/src/index-tools/index_tool.cpp
+++ b/be/src/index-tools/index_tool.cpp
@@ -89,8 +89,8 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke
     IndexSearcher s(reader);
     std::unique_ptr<lucene::search::Query> query;
 
-    std::wstring field_ws(field.begin(), field.end());
-    std::wstring token_ws(token.begin(), token.end());
+    auto field_ws = StringUtil::string_to_wstring(field);
+    auto token_ws = StringUtil::string_to_wstring(token);
     lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str());
     if (pred == "eq" || pred == "match") {
         query.reset(new lucene::search::TermQuery(term));
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 5780f04ade9..79ab97ee50a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -155,7 +155,7 @@ void InvertedIndexReader::get_analyse_result(std::vector<std::string>& analyse_r
                                              bool drop_duplicates) {
     analyse_result.clear();
 
-    std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
+    std::wstring field_ws = StringUtil::string_to_wstring(field_name);
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
             analyzer->tokenStream(field_ws.c_str(), reader));
 
@@ -316,7 +316,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
 
         IndexSearcherPtr index_searcher = nullptr;
         std::unique_ptr<lucene::search::Query> query;
-        std::wstring field_ws = std::wstring(column_name.begin(), column_name.end());
+        std::wstring field_ws = StringUtil::string_to_wstring(column_name);
 
         roaring::Roaring query_match_bitmap;
         bool null_bitmap_already_read = false;
@@ -635,7 +635,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
     // std::string search_str = reinterpret_cast<const StringRef*>(query_value)->to_string();
     VLOG_DEBUG << "begin to query the inverted index from clucene"
                << ", column_name: " << column_name << ", search_str: " << search_str;
-    std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
+    std::wstring column_name_ws = StringUtil::string_to_wstring(column_name);
     std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
     // unique_ptr with custom deleter
     std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index f2c891fefca..7b9628c7c04 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -83,7 +83,7 @@ public:
         _parser_type = get_inverted_index_parser_type_from_string(
                 get_parser_string_from_properties(_index_meta->properties()));
         _value_key_coder = get_key_coder(field_type);
-        _field_name = std::wstring(field_name.begin(), field_name.end());
+        _field_name = StringUtil::string_to_wstring(field_name);
     }
 
     ~InvertedIndexColumnWriterImpl() override {
diff --git a/regression-test/data/inverted_index_p0/test_index_chinese_column.out b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
new file mode 100644
index 00000000000..541d416885c
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_chinese_column.out
@@ -0,0 +1,3 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1      json love anny  json    anny    2023-10-10T12:11:11
diff --git a/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
new file mode 100644
index 00000000000..880077585d2
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_chinese_column.groovy
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+suite("test_index_chinese_column", "inverted_index_select"){
+    def createAndInsertData = { table_name ->
+        sql "DROP TABLE IF EXISTS ${table_name}"
+        sql """
+            CREATE TABLE ${table_name}
+            (
+                k1 int ,
+                名称 string,
+                k3 char(50),
+                k4 varchar(200),
+                k5 datetime,
+                index index_str_k2 (`名称`) using inverted properties("parser"="english","ignore_above"="257")
+            )
+            DISTRIBUTED BY RANDOM BUCKETS 1
+            PROPERTIES("replication_num" = "1")
+        """
+        sql " insert into ${table_name} values(1, 'json love anny', 'json', 'anny', '2023-10-10 12:11:11') "
+        qt_sql "SELECT * FROM ${table_name} WHERE 名称 match_all 'json'"
+    }
+
+    def table_name = "test_index_chinese_column"
+
+    sql "set enable_unicode_name_support=true"
+
+    createAndInsertData(table_name)
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to