This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new fe6bae2924 [fix](invert index) supports utf8 and non-utf8 strings (#22570)
fe6bae2924 is described below

commit fe6bae29248cb8019878e13a88239741538f606a
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Sat Aug 5 12:52:53 2023 +0800

    [fix](invert index) supports utf8 and non-utf8 strings (#22570)
    
    supports utf8 and non-utf8 strings: [fix] compatible with utf8 and invalid utf8 doris-thirdparty#110
---
 be/src/clucene                                     |  2 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    |  5 +-
 docs/zh-CN/docs/data-table/index/inverted-index.md |  4 +-
 .../test_inverted_index_keyword.out                | 61 ++++++++++++++
 .../test_inverted_index_keyword.groovy             | 92 ++++++++++++++++++++++
 5 files changed, 159 insertions(+), 5 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 313ae23c47..dda894af51 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 313ae23c47ea6f73289e79364a259e404458ac7f
+Subproject commit dda894af51024226f10336eea3d344cebeef310d
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 9b5e10ee44..992c4fa8d8 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -37,6 +37,7 @@
 #include <CLucene/util/CLStreams.h>
 #include <CLucene/util/FutureArrays.h>
 #include <CLucene/util/bkd/bkd_docid_iterator.h>
+#include <CLucene/util/stringUtil.h>
 #include <math.h>
 #include <string.h>
 
@@ -141,7 +142,7 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
         if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
             if (token.termLength<char>() != 0) {
                 std::string_view term(token.termBuffer<char>(), token.termLength<char>());
-                std::wstring ws_term = lucene_utf8stows(term);
+                std::wstring ws_term = StringUtil::string_to_wstring(term);
                 analyse_result.emplace_back(ws_term);
             }
         } else {
@@ -443,7 +444,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
     VLOG_DEBUG << "begin to query the inverted index from clucene"
                << ", column_name: " << column_name << ", search_str: " << search_str;
     std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
-    std::wstring search_str_ws = lucene_utf8stows(search_str);
+    std::wstring search_str_ws = StringUtil::string_to_wstring(search_str);
     // unique_ptr with custom deleter
     std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
             _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()),
diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md
index f3bb248955..25633f0913 100644
--- a/docs/zh-CN/docs/data-table/index/inverted-index.md
+++ b/docs/zh-CN/docs/data-table/index/inverted-index.md
@@ -75,8 +75,8 @@ Doris倒排索引的功能简要介绍如下:
       - chinese是中文分词,适合被索引列主要是中文的情况,性能比english分词低
       - unicode是多语言混合类型分词,适用于中英文混合、多语言混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文按字符分词。
     - parser_mode用于指定分词的模式,目前parser = chinese时支持如下几种模式:
-      - fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词
-      - coarse_grained:粗粒度模式,倾向于分出比较长的词,,比如 '武汉长江大桥' 会分成 '武汉市' '长江大桥' 2个词
+      - fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉市长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词
+      - coarse_grained:粗粒度模式,倾向于分出比较长的词,,比如 '武汉市长江大桥' 会分成 '武汉市' '长江大桥' 2个词
       - 默认coarse_grained
     - support_phrase用于指定索引是否支持MATCH_PHRASE短语查询加速
       - true为支持,但是索引需要更多的存储空间
diff --git a/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out b/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out
new file mode 100644
index 0000000000..6d8a6429b8
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out
@@ -0,0 +1,61 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1      330204195805121025
+
+-- !sql --
+2      36
+
+-- !sql --
+2      330225197806187713
+
+-- !sql --
+2      330227195911020791
+
+-- !sql --
+2      330224196312012744
+
+-- !sql --
+2      330205196003131214
+
+-- !sql --
+2      330224197301242119
+
+-- !sql --
+2      3302哈哈1645676
+
+-- !sql --
+2      330225196202011579
+
+-- !sql --
+2      33022719660610183x
+
+-- !sql --
+2      330225197801043198
+
+-- !sql --
+3      中国
+
+-- !sql --
+3      美国
+
+-- !sql --
+3      英国
+
+-- !sql --
+3      体育
+
+-- !sql --
+3      体育场
+
+-- !sql --
+3      中国人
+
+-- !sql --
+3      北京市
+
+-- !sql --
+3      我在北京市
+
+-- !sql --
+3      我在西安市
+
diff --git a/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy b/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy
new file mode 100644
index 0000000000..1663de8e7c
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_inverted_index_keyword"){
+    // prepare test table
+
+
+    def timeout = 60000
+    def delta_time = 1000
+    def alter_res = "null"
+    def useTime = 0
+
+    def indexTblName = "test_inverted_index_keyword"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    // create 1 replica table
+    sql """
+       CREATE TABLE IF NOT EXISTS ${indexTblName}(
+               `id`int(11)NULL,
+               `c` text NULL,
+               INDEX c_idx(`c`) USING INVERTED COMMENT ''
+       ) ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT 'OLAP'
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES(
+               "replication_allocation" = "tag.location.default: 1"
+       );
+    """
+    
+    def var_result = sql "show variables"
+    logger.info("show variales result: " + var_result )
+
+    sql """INSERT INTO ${indexTblName} VALUES
+        (1, '330204195805121025'),
+        (2, '36'),
+        (2, '330225197806187713'),
+        (2, '330227195911020791'),
+        (2, '330224196312012744'),
+        (2, '330205196003131214'),
+        (2, '330224197301242119'),
+        (2, '3302哈哈1645676'),
+        (2, '330225196202011579'),
+        (2, '33022719660610183x'),
+        (2, '330225197801043198'),
+        (3, '中国'),
+        (3, '美国'),
+        (3, '英国'),
+        (3, '体育'),
+        (3, '体育场'),
+        (3, '中国人'),
+        (3, '北京市'),
+        (3, '我在北京市'),
+        (3, '我在西安市')
+    """
+
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330204195805121025'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '36'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330225197806187713'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330227195911020791'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330224196312012744'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330205196003131214'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330224197301242119'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '3302哈哈1645676'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330225196202011579'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '33022719660610183x'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '330225197801043198'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '中国'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '美国'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '英国'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '体育'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '体育场'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '中国人'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '北京市'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '我在北京市'";
+    qt_sql "SELECT * FROM ${indexTblName} where c match '我在西安市'";
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to