This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 5d576b41d70 [opt](invert index) use lowercase by default #32405 (#32940)
5d576b41d70 is described below

commit 5d576b41d70611c6724a6ea9b87c7b2c489125e8
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Fri Mar 29 14:37:40 2024 +0800

    [opt](invert index) use lowercase by default #32405 (#32940)
---
 be/src/olap/compaction.cpp                         | 41 +++++++++++-
 be/src/olap/inverted_index_parser.cpp              |  9 ---
 be/src/olap/inverted_index_parser.h                | 23 ++++++-
 .../rowset/segment_v2/inverted_index_writer.cpp    |  2 +-
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  3 +-
 be/src/olap/tablet_schema.cpp                      |  8 +++
 .../test_index_lowercase_fault_injection.out       | 13 ++++
 .../data/inverted_index_p0/test_lowercase.out      |  6 ++
 .../test_index_lowercase_fault_injection.groovy    | 76 ++++++++++++++++++++++
 9 files changed, 166 insertions(+), 15 deletions(-)

diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index e852344688c..9bedbce11ab 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -460,9 +460,11 @@ Status Compaction::do_compaction_impl(int64_t permits) {
             // src index files
             // format: rowsetId_segmentId
             std::vector<std::string> src_index_files(src_segment_num);
+            std::vector<RowsetId> src_rowset_ids;
             for (const auto& m : src_seg_to_id_map) {
                 std::pair<RowsetId, uint32_t> p = m.first;
                 src_index_files[m.second] = p.first.to_string() + "_" + std::to_string(p.second);
+                src_rowset_ids.push_back(p.first);
             }
 
             // dest index files
@@ -597,9 +599,36 @@ Status Compaction::do_compaction_impl(int64_t permits) {
                 }
             };
 
+            Status status = Status::OK();
             for (auto&& column_uniq_id : ctx.skip_inverted_index) {
                 auto col = _cur_tablet_schema->column_by_uid(column_uniq_id);
                 const auto* index_meta = _cur_tablet_schema->get_inverted_index(col);
+
+                // if index properties are different, index compaction maybe needs to be skipped.
+                bool is_continue = false;
+                std::optional<std::map<std::string, std::string>> first_properties;
+                for (const auto& rowset_id : src_rowset_ids) {
+                    auto rowset_ptr = _tablet->get_rowset(rowset_id);
+                    const auto* tablet_index = rowset_ptr->tablet_schema()->get_inverted_index(col);
+                    const auto& properties = tablet_index->properties();
+                    if (!first_properties.has_value()) {
+                        first_properties = properties;
+                    } else {
+                        if (properties != first_properties.value()) {
+                            error_handler(index_meta->index_id(), column_uniq_id);
+                            status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(
+                                    "if index properties are different, index compaction needs to "
+                                    "be "
+                                    "skipped.");
+                            is_continue = true;
+                            break;
+                        }
+                    }
+                }
+                if (is_continue) {
+                    continue;
+                }
+
                 std::vector<lucene::store::Directory*> dest_index_dirs(dest_segment_num);
                 std::vector<lucene::store::Directory*> src_index_dirs(src_segment_num);
                 try {
@@ -620,15 +649,21 @@ Status Compaction::do_compaction_impl(int64_t permits) {
                                            fs, index_tmp_path, trans_vec, 
dest_segment_num_rows);
                     if (!st.ok()) {
                         error_handler(index_meta->index_id(), column_uniq_id);
-                        return Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
+                        status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
                     }
                 } catch (CLuceneError& e) {
                     error_handler(index_meta->index_id(), column_uniq_id);
-                    return Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what());
+                    status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(e.what());
                 }
             }
             for (auto& inverted_index_file_writer : inverted_index_file_writers) {
-                RETURN_IF_ERROR(inverted_index_file_writer->close());
+                if (Status st = inverted_index_file_writer->close(); !st.ok()) {
+                    status = Status::Error<INVERTED_INDEX_COMPACTION_ERROR>(st.msg());
+                }
+            }
+            // check index compaction status. If status is not ok, we should return error and end this compaction round.
+            if (!status.ok()) {
+                return status;
             }
 
             LOG(INFO) << "succeed to do index compaction"
diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index 17cddc042f0..07a587dd2dd 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -126,13 +126,4 @@ std::string get_parser_ignore_above_value_from_properties(
     }
 }
 
-std::string get_parser_lowercase_from_properties(
-        const std::map<std::string, std::string>& properties) {
-    if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) {
-        return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
-    } else {
-        return "";
-    }
-}
-
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index 4a84823d14c..9df825bf69d 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -21,6 +21,8 @@
 #include <memory>
 #include <string>
 
+#include "util/debug_points.h"
+
 namespace lucene {
 namespace analysis {
 class Analyzer;
@@ -49,6 +51,9 @@ struct InvertedIndexCtx {
 
 using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
 
+const std::string INVERTED_INDEX_PARSER_TRUE = "true";
+const std::string INVERTED_INDEX_PARSER_FALSE = "false";
+
 const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
 const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
 const std::string INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
@@ -90,6 +95,22 @@ CharFilterMap get_parser_char_filter_map_from_properties(
 // get parser ignore_above value from properties
 std::string get_parser_ignore_above_value_from_properties(
         const std::map<std::string, std::string>& properties);
+
+template <bool ReturnTrue = false>
 std::string get_parser_lowercase_from_properties(
-        const std::map<std::string, std::string>& properties);
+        const std::map<std::string, std::string>& properties) {
+    if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) {
+        return properties.at(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+    } else {
+        DBUG_EXECUTE_IF("inverted_index_parser.get_parser_lowercase_from_properties",
+                        { return ""; })
+
+        if constexpr (ReturnTrue) {
+            return INVERTED_INDEX_PARSER_TRUE;
+        } else {
+            return "";
+        }
+    }
+}
+
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 54f3feb3c5d..8b1ae50433c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -222,7 +222,7 @@ public:
     }
 
     void setup_analyzer_lowercase(std::unique_ptr<lucene::analysis::Analyzer>& analyzer) {
-        auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties());
+        auto lowercase = get_parser_lowercase_from_properties<true>(_index_meta->properties());
         if (lowercase == "true") {
             analyzer->set_lowercase(true);
         } else if (lowercase == "false") {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 67a5b9393e0..dc692fa9bc0 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1275,9 +1275,10 @@ Status SegmentIterator::_init_inverted_index_iterators() {
     }
     for (auto cid : _schema->column_ids()) {
         if (_inverted_index_iterators[cid] == nullptr) {
+            // Use segment’s own index_meta, for compatibility with future indexing needs to default to lowercase.
             RETURN_IF_ERROR(_segment->new_inverted_index_iterator(
                     _opts.tablet_schema->column(cid),
-                    _opts.tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
+                    _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
                     _opts, &_inverted_index_iterators[cid]));
         }
     }
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index b0410a2b341..0e9376e09cf 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -799,6 +799,14 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const {
         (*index->mutable_properties())[kv.first] = kv.second;
     }
     index->set_index_suffix_name(_escaped_index_suffix_path);
+
+    DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })
+
+    // lowercase by default
+    if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
+        (*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
+                INVERTED_INDEX_PARSER_TRUE;
+    }
 }
 
 void TabletSchema::append_column(TabletColumn column, ColumnType col_type) {
diff --git a/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out
new file mode 100644
index 00000000000..196077986ec
--- /dev/null
+++ b/regression-test/data/fault_injection_p0/test_index_lowercase_fault_injection.out
@@ -0,0 +1,13 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+5
+
+-- !sql --
+0
+
+-- !sql --
+8
+
+-- !sql --
+3
+
diff --git a/regression-test/data/inverted_index_p0/test_lowercase.out b/regression-test/data/inverted_index_p0/test_lowercase.out
index 03c2f57468f..2ca46501026 100644
--- a/regression-test/data/inverted_index_p0/test_lowercase.out
+++ b/regression-test/data/inverted_index_p0/test_lowercase.out
@@ -31,11 +31,17 @@
 
 -- !sql --
 1      hello 我来到北京清华大学
+2      HELLO 我爱你中国
+3      Hello 人民可以得到更多实惠
 
 -- !sql --
+1      hello 我来到北京清华大学
 2      HELLO 我爱你中国
+3      Hello 人民可以得到更多实惠
 
 -- !sql --
+1      hello 我来到北京清华大学
+2      HELLO 我爱你中国
 3      Hello 人民可以得到更多实惠
 
 -- !sql --
diff --git a/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy
new file mode 100644
index 00000000000..0f522652bb4
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/test_index_lowercase_fault_injection.groovy
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_lowercase_fault_injection") {
+    // define a sql table
+    def testTable = "httplogs_lowercase"
+
+    def create_httplogs_unique_table = {testTablex ->
+      // multi-line sql
+      def result = sql """
+        CREATE TABLE ${testTablex} (
+          `@timestamp` int(11) NULL COMMENT "",
+          `clientip` string NULL COMMENT "",
+          `request` string NULL COMMENT "",
+          `status` string NULL COMMENT "",
+          `size` string NULL COMMENT "",
+          INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "chinese", "support_phrase" = "true") COMMENT ''
+          ) ENGINE=OLAP
+          DUPLICATE KEY(`@timestamp`)
+          COMMENT "OLAP"
+          DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
+          PROPERTIES (
+          "replication_allocation" = "tag.location.default: 1"
+        );
+      """
+    }
+
+    try {
+      sql "DROP TABLE IF EXISTS ${testTable}"
+      create_httplogs_unique_table.call(testTable)
+
+      try {
+        GetDebugPoint().enableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties")
+        GetDebugPoint().enableDebugPointForAllBEs("tablet_schema.to_schema_pb")
+
+        sql """ INSERT INTO ${testTable} VALUES (893964617, '40.135.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 24736); """
+        sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+        sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
+        sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
+        sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+
+        sql 'sync'
+      } finally {
+        GetDebugPoint().disableDebugPointForAllBEs("inverted_index_parser.get_parser_lowercase_from_properties")
+        GetDebugPoint().disableDebugPointForAllBEs("tablet_schema.to_schema_pb")
+      }
+
+      qt_sql """ select count() from ${testTable} where (request match 'HTTP');  """
+      qt_sql """ select count() from ${testTable} where (request match 'http');  """
+
+      sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
+      sql """ INSERT INTO ${testTable} VALUES (893964672, '26.1.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 304, 0); """
+      sql """ INSERT INTO ${testTable} VALUES (893964653, '232.0.0.0', 'GET /images/hm_bg.jpg HTTP/1.0', 200, 3781); """
+
+      sql 'sync'
+
+      qt_sql """ select count() from ${testTable} where (request match 'HTTP');  """
+      qt_sql """ select count() from ${testTable} where (request match 'http');  """
+    } finally {
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to