This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new fe6bae2924 [fix](invert index) supports utf8 and non-utf8 strings (#22570) fe6bae2924 is described below commit fe6bae29248cb8019878e13a88239741538f606a Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Sat Aug 5 12:52:53 2023 +0800 [fix](invert index) supports utf8 and non-utf8 strings (#22570) supports utf8 and non-utf8 strings: [fix] compatible with utf8 and invalid utf8 doris-thirdparty#110 --- be/src/clucene | 2 +- .../rowset/segment_v2/inverted_index_reader.cpp | 5 +- docs/zh-CN/docs/data-table/index/inverted-index.md | 4 +- .../test_inverted_index_keyword.out | 61 ++++++++++++++ .../test_inverted_index_keyword.groovy | 92 ++++++++++++++++++++++ 5 files changed, 159 insertions(+), 5 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index 313ae23c47..dda894af51 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 313ae23c47ea6f73289e79364a259e404458ac7f +Subproject commit dda894af51024226f10336eea3d344cebeef310d diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 9b5e10ee44..992c4fa8d8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -37,6 +37,7 @@ #include <CLucene/util/CLStreams.h> #include <CLucene/util/FutureArrays.h> #include <CLucene/util/bkd/bkd_docid_iterator.h> +#include <CLucene/util/stringUtil.h> #include <math.h> #include <string.h> @@ -141,7 +142,7 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) { if (token.termLength<char>() != 0) { std::string_view term(token.termBuffer<char>(), token.termLength<char>()); - std::wstring ws_term = lucene_utf8stows(term); + std::wstring ws_term = StringUtil::string_to_wstring(term); analyse_result.emplace_back(ws_term); } } else { @@ -443,7 +444,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end()); - std::wstring search_str_ws = lucene_utf8stows(search_str); + std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); // unique_ptr with custom deleter std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term { _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index f3bb248955..25633f0913 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -75,8 +75,8 @@ Doris倒排索引的功能简要介绍如下: - chinese是中文分词,适合被索引列主要是中文的情况,性能比english分词低 - unicode是多语言混合类型分词,适用于中英文混合、多语言混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文按字符分词。 - parser_mode用于指定分词的模式,目前parser = chinese时支持如下几种模式: - - fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词 - - coarse_grained:粗粒度模式,倾向于分出比较长的词,,比如 '武汉长江大桥' 会分成 '武汉市' '长江大桥' 2个词 + - fine_grained:细粒度模式,倾向于分出比较短的词,比如 '武汉市长江大桥' 会分成 '武汉', '武汉市', '市长', '长江', '长江大桥', '大桥' 6个词 + - coarse_grained:粗粒度模式,倾向于分出比较长的词,,比如 '武汉市长江大桥' 会分成 '武汉市' '长江大桥' 2个词 - 默认coarse_grained - support_phrase用于指定索引是否支持MATCH_PHRASE短语查询加速 - true为支持,但是索引需要更多的存储空间 diff --git a/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out b/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out new file mode 100644 index 0000000000..6d8a6429b8 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_inverted_index_keyword.out @@ -0,0 +1,61 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 330204195805121025 + +-- !sql -- +2 36 + +-- !sql -- +2 330225197806187713 + +-- !sql -- +2 330227195911020791 + +-- !sql -- +2 330224196312012744 + +-- !sql -- +2 330205196003131214 + +-- !sql -- +2 330224197301242119 + +-- !sql -- +2 3302哈哈1645676 + +-- !sql -- +2 330225196202011579 + +-- !sql -- +2 33022719660610183x + +-- !sql -- +2 330225197801043198 + +-- !sql -- +3 中国 + +-- !sql -- +3 美国 + +-- !sql -- +3 英国 + +-- !sql -- +3 体育 + +-- !sql -- +3 体育场 + +-- !sql -- +3 中国人 + +-- !sql -- +3 北京市 + +-- !sql -- +3 我在北京市 + +-- !sql -- +3 我在西安市 + diff --git a/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy b/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy new file mode 100644 index 0000000000..1663de8e7c --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_inverted_index_keyword.groovy @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_inverted_index_keyword"){ + // prepare test table + + + def timeout = 60000 + def delta_time = 1000 + def alter_res = "null" + def useTime = 0 + + def indexTblName = "test_inverted_index_keyword" + + sql "DROP TABLE IF EXISTS ${indexTblName}" + // create 1 replica table + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def var_result = sql "show variables" + logger.info("show variales result: " + var_result ) + + sql """INSERT INTO ${indexTblName} VALUES + (1, '330204195805121025'), + (2, '36'), + (2, '330225197806187713'), + (2, '330227195911020791'), + (2, '330224196312012744'), + (2, '330205196003131214'), + (2, '330224197301242119'), + (2, '3302哈哈1645676'), + (2, '330225196202011579'), + (2, '33022719660610183x'), + (2, '330225197801043198'), + (3, '中国'), + (3, '美国'), + (3, '英国'), + (3, '体育'), + (3, '体育场'), + (3, '中国人'), + (3, '北京市'), + (3, '我在北京市'), + (3, '我在西安市') + """ + + qt_sql "SELECT * FROM ${indexTblName} where c match '330204195805121025'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '36'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330225197806187713'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330227195911020791'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330224196312012744'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330205196003131214'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330224197301242119'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '3302哈哈1645676'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330225196202011579'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '33022719660610183x'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '330225197801043198'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '中国'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '美国'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '英国'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '体育'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '体育场'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '中国人'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '北京市'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '我在北京市'"; + qt_sql "SELECT * FROM ${indexTblName} where c match '我在西安市'"; +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org