This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 5d576b41d70 [opt](invert index) use lowercase by default #32405 (#32940) 5d576b41d70 is described below commit 5d576b41d70611c6724a6ea9b87c7b2c489125e8 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Fri Mar 29 14:37:40 2024 +0800 [opt](invert index) use lowercase by default #32405 (#32940) --- be/src/olap/compaction.cpp | 41 +++++++++++- be/src/olap/inverted_index_parser.cpp | 9 --- be/src/olap/inverted_index_parser.h | 23 ++++++- .../rowset/segment_v2/inverted_index_writer.cpp | 2 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 3 +- be/src/olap/tablet_schema.cpp | 8 +++ .../test_index_lowercase_fault_injection.out | 13 ++++ .../data/inverted_index_p0/test_lowercase.out | 6 ++ .../test_index_lowercase_fault_injection.groovy | 76 ++++++++++++++++++++++ 9 files changed, 166 insertions(+), 15 deletions(-) diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index e852344688c..9bedbce11ab 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -460,9 +460,11 @@ Status Compaction::do_compaction_impl(int64_t permits) { // src index files // format: rowsetId_segmentId std::vector<std::string> src_index_files(src_segment_num); + std::vector<RowsetId> src_rowset_ids; for (const auto& m : src_seg_to_id_map) { std::pair<RowsetId, uint32_t> p = m.first; src_index_files[m.second] = p.first.to_string() + "_" + std::to_string(p.second); + src_rowset_ids.push_back(p.first); } // dest index files @@ -597,9 +599,36 @@ Status Compaction::do_compaction_impl(int64_t permits) { } }; + Status status = Status::OK(); for (auto&& column_uniq_id : ctx.skip_inverted_index) { auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); const auto* index_meta = _cur_tablet_schema->get_inverted_index(col); + + // if index properties are different, index compaction maybe needs to be skipped. + bool is_continue = false; + std::optional<std::map<std::string, std::string>> first_properties; + for (const auto& rowset_id : src_rowset_ids) { + auto rowset_ptr = _tablet->get_rowset(rowset_id); + const auto* tablet_index = rowset_ptr->tablet_schema()->get_inverted_index(col); + const auto& properties = tablet_index->properties(); + if (!first_properties.has_value()) { + first_properties = properties; + } else { + if (properties != first_properties.value()) { + error_handler(index_meta->index_id(), column_uniq_id); + status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>( + "if index properties are different, index compaction needs to " + "be " + "skipped."); + is_continue = true; + break; + } + } + } + if (is_continue) { + continue; + } + std::vector<lucene::store::Directory*> dest_index_dirs(dest_segment_num); std::vector<lucene::store::Directory*> src_index_dirs(src_segment_num); try { @@ -620,15 +649,21 @@ Status Compaction::do_compaction_impl(int64_t permits) { fs, index_tmp_path, trans_vec, dest_segment_num_rows); if (!st.ok()) { error_handler(index_meta->index_id(), column_uniq_id); - return Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg()); + status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg()); } } catch (CLuceneError& e) { error_handler(index_meta->index_id(), column_uniq_id); - return Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what()); + status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what()); } } for (auto& inverted_index_file_writer : inverted_index_file_writers) { - RETURN_IF_ERROR(inverted_index_file_writer->close()); + if (Status st = inverted_index_file_writer->close(); !st.ok()) { + status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg()); + } + } + // check index compaction status. If status is not ok, we should return error and end this compaction round. + if (!status.ok()) { + return status; } LOG(INFO) << "succeed to do index compaction" diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 17cddc042f0..07a587dd2dd 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -126,13 +126,4 @@ std::string get_parser_ignore_above_value_from_properties( } } -std::string get_parser_lowercase_from_properties( - const std::map<std::string, std::string>& properties) { - if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { - return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY); - } else { - return ""; - } -} - } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 4a84823d14c..9df825bf69d 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -21,6 +21,8 @@ #include <memory> #include <string> +#include "util/debug_points.h" + namespace lucene { namespace analysis { class Analyzer; @@ -49,6 +51,9 @@ struct InvertedIndexCtx { using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>; +const std::string INVERTED_INDEX_PARSER_TRUE = "true"; +const std::string INVERTED_INDEX_PARSER_FALSE = "false"; + const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained"; @@ -90,6 +95,22 @@ CharFilterMap get_parser_char_filter_map_from_properties( // get parser ignore_above value from properties std::string get_parser_ignore_above_value_from_properties( const std::map<std::string, std::string>& properties); + +template <bool ReturnTrue = false> std::string get_parser_lowercase_from_properties( - const std::map<std::string, std::string>& properties); + const std::map<std::string, std::string>& properties) { + if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY); + } else { + DBUG_EXECUTE_IF("inverted_index_parser.get_parser_lowercase_from_properties", + { return ""; }) + + if constexpr (ReturnTrue) { + return INVERTED_INDEX_PARSER_TRUE; + } else { + return ""; + } + } +} + } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 54f3feb3c5d..8b1ae50433c 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -222,7 +222,7 @@ public: } void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) { - auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); + auto lowercase = get_parser_lowercase_from_properties<true>(_index_meta->properties()); if (lowercase == "true") { analyzer->set_lowercase(true); } else if (lowercase == "false") { diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 67a5b9393e0..dc692fa9bc0 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1275,9 +1275,10 @@ Status SegmentIterator::_init_inverted_index_iterators() { } for (auto cid : _schema->column_ids()) { if (_inverted_index_iterators[cid] == nullptr) { + // Use segment’s own index_meta, for compatibility with future indexing needs to default to lowercase. RETURN_IF_ERROR(_segment->new_inverted_index_iterator( _opts.tablet_schema->column(cid), - _opts.tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)), + _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)), _opts, &_inverted_index_iterators[cid])); } } diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index b0410a2b341..0e9376e09cf 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -799,6 +799,14 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const { (*index->mutable_properties())[kv.first] = kv.second; } index->set_index_suffix_name(_escaped_index_suffix_path); + + DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; }) + + // lowercase by default + if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) { + (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] = + INVERTED_INDEX_PARSER_TRUE; + } } void TabletSchema::append_column(TabletColumn column, ColumnType col_type) { diff --git a/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out new file mode 100644 index 00000000000..196077986ec --- /dev/null +++ b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +5 + +-- !sql -- +0 + +-- !sql -- +8 + +-- !sql -- +3 + diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out b/regression-test/data/inverted_index_p0/test_lowercase.out index 03c2f57468f..2ca46501026 100644 --- a/regression-test/data/inverted_index_p0/test_lowercase.out +++ b/regression-test/data/inverted_index_p0/test_lowercase.out @@ -31,11 +31,17 @@ -- !sql -- 1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 -- !sql -- +1 hello 我来到北京清华大学 2 HELLO 我爱你中国 +3 Hello 人民可以得到更多实惠 -- !sql -- +1 hello 我来到北京清华大学 +2 HELLO 我爱你中国 3 Hello 人民可以得到更多实惠 -- !sql -- diff --git a/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy new file mode 100644 index 00000000000..0f522652bb4 --- /dev/null +++ b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_lowercase_fault_injection") { + // define a sql table + def testTable = "httplogs_lowercase" + + def create_httplogs_unique_table = {testTablex -> + // multi-line sql + def result = sql """ + CREATE TABLE ${testTablex} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` string NULL COMMENT "", + `request` string NULL COMMENT "", + `status` string NULL COMMENT "", + `size` string NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "chinese", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + } + + try { + sql "DROP TABLE IF EXISTS ${testTable}" + create_httplogs_unique_table.call(testTable) + + try { + GetDebugPoint().enableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties") + GetDebugPoint().enableDebugPointForAllBEs("tablet_schema.to_schema_pb") + + sql """ INSERT INTO ${testTable} VALUES (893964617, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """ + sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """ + sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """ + sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """ + sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """ + + sql 'sync' + } finally { + GetDebugPoint().disableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties") + GetDebugPoint().disableDebugPointForAllBEs("tablet_schema.to_schema_pb") + } + + qt_sql """ select count() from ${testTable} where (request match 'HTTP'); """ + qt_sql """ select count() from ${testTable} where (request match 'http'); """ + + sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """ + sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """ + sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """ + + sql 'sync' + + qt_sql """ select count() from ${testTable} where (request match 'HTTP'); """ + qt_sql """ select count() from ${testTable} where (request match 'http'); """ + } finally { + } +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org