This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit 2c546e885ac4abed61b7523d22345cf381f5a8f6
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Sun Sep 10 23:08:28 2023 +0800

    [feature](invert index) add tokenizer CharFilter preprocessing (#24102)
---
 be/src/olap/inverted_index_parser.cpp              | 36 ++++++++++
 be/src/olap/inverted_index_parser.h                |  8 +++
 .../char_filter/char_filter_factory.h              | 37 ++++++++++
 .../char_filter/char_replace_char_filter.cpp       | 59 ++++++++++++++++
 .../char_filter/char_replace_char_filter.h         | 48 +++++++++++++
 .../rowset/segment_v2/inverted_index_writer.cpp    | 13 +++-
 .../char_filter/test_char_replace.out              | 46 ++++++++++++
 .../char_filter/test_char_replace.groovy           | 82 ++++++++++++++++++++++
 8 files changed, 328 insertions(+), 1 deletion(-)

diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp
index b0ab8c9d1a..5678a217b5 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -17,6 +17,7 @@
 
 #include "olap/inverted_index_parser.h"
 
+#include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 #include "util/string_util.h"
 
 namespace doris {
@@ -83,4 +84,39 @@ std::string get_parser_phrase_support_string_from_properties(
         return INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO;
     }
 }
+
+CharFilterMap get_parser_char_filter_map_from_properties(
+        const std::map<std::string, std::string>& properties) {
+    CharFilterMap char_filter_map;
+
+    if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE) == 
properties.end()) {
+        return CharFilterMap();
+    }
+
+    std::string type = properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
+    if (type == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
+        // type
+        char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] =
+                INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE;
+
+        // pattern
+        if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN) == 
properties.end()) {
+            return CharFilterMap();
+        }
+        std::string pattern = 
properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
+        char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN] = pattern;
+
+        // replacement
+        std::string replacement = " ";
+        if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT) != 
properties.end()) {
+            replacement = 
properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
+        }
+        char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT] = 
replacement;
+    } else {
+        return CharFilterMap();
+    }
+
+    return char_filter_map;
+}
+
 } // namespace doris
diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index eb4c414308..307c78e635 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -38,6 +38,7 @@ struct InvertedIndexCtx {
 };
 
 using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
+using CharFilterMap = std::map<std::string, std::string>;
 
 const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
 const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
@@ -55,6 +56,10 @@ const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase";
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true";
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false";
 
+const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
+const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = 
"char_filter_pattern";
+const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = 
"char_filter_replacement";
+
 std::string inverted_index_parser_type_to_string(InvertedIndexParserType 
parser_type);
 
 InvertedIndexParserType get_inverted_index_parser_type_from_string(const 
std::string& parser_str);
@@ -65,4 +70,7 @@ std::string get_parser_mode_string_from_properties(
 std::string get_parser_phrase_support_string_from_properties(
         const std::map<std::string, std::string>& properties);
 
+CharFilterMap get_parser_char_filter_map_from_properties(
+        const std::map<std::string, std::string>& properties);
+
 } // namespace doris
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h
new file mode 100644
index 0000000000..561054863d
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h"
+
+namespace doris {
+
+static const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = 
"char_replace";
+
+class CharFilterFactory {
+public:
+    template <typename... Args>
+    static lucene::analysis::CharFilter* create(const std::string& name, 
Args&&... args) {
+        if (name == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
+            return new CharReplaceCharFilter(std::forward<Args>(args)...);
+        }
+        return nullptr;
+    }
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp
new file mode 100644
index 0000000000..c0545b9cf9
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "char_replace_char_filter.h"
+
+#include <boost/algorithm/string/replace.hpp>
+
+namespace doris {
+
+CharReplaceCharFilter::CharReplaceCharFilter(lucene::util::Reader* in, const 
std::string& pattern,
+                                             const std::string& replacement)
+        : CharFilter(in), _replacement(replacement) {
+    std::for_each(pattern.begin(), pattern.end(), [this](uint8_t c) { 
_patterns.set(c); });
+}
+
+void CharReplaceCharFilter::init(const void* _value, int32_t _length, bool 
copyData) {
+    input_->init(_value, _length, copyData);
+    fill();
+}
+
+int32_t CharReplaceCharFilter::read(const void** start, int32_t min, int32_t 
max) {
+    return _transformed_input.read(start, min, max);
+}
+
+int32_t CharReplaceCharFilter::readCopy(void* start, int32_t off, int32_t len) 
{
+    return _transformed_input.readCopy(start, off, len);
+}
+
+void CharReplaceCharFilter::fill() {
+    _buf.resize(input_->size());
+    input_->readCopy(_buf.data(), 0, _buf.size());
+    process_pattern(_buf);
+    _transformed_input.init(_buf.data(), _buf.size(), false);
+}
+
+void CharReplaceCharFilter::process_pattern(std::string& buf) {
+    for (char& c : buf) {
+        uint8_t uc = static_cast<uint8_t>(c);
+        if (_patterns.test(uc)) {
+            c = _replacement[0];
+        }
+    }
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
new file mode 100644
index 0000000000..2867890b3e
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <CLucene.h>
+#include <CLucene/analysis/CharFilter.h>
+
+#include <bitset>
+
+namespace doris {
+
+class CharReplaceCharFilter : public lucene::analysis::CharFilter {
+public:
+    CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern,
+                          const std::string& replacement);
+    virtual ~CharReplaceCharFilter() = default;
+
+    void init(const void* _value, int32_t _length, bool copyData) override;
+    int32_t read(const void** start, int32_t min, int32_t max) override;
+    int32_t readCopy(void* start, int32_t off, int32_t len) override;
+
+private:
+    void fill();
+    void process_pattern(std::string& buf);
+
+    std::bitset<256> _patterns;
+    std::string _replacement;
+
+    std::string _buf;
+    lucene::util::SStringReader<char> _transformed_input;
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index 8cd65bbab6..07e8f7a886 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -37,6 +37,7 @@
 #include "olap/key_coder.h"
 #include "olap/olap_common.h"
 #include "olap/rowset/segment_v2/common.h"
+#include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
 #include "olap/rowset/segment_v2/inverted_index_desc.h"
@@ -152,6 +153,16 @@ public:
         }
 
         _char_string_reader = 
std::make_unique<lucene::util::SStringReader<char>>();
+        CharFilterMap char_filter_map =
+                
get_parser_char_filter_map_from_properties(_index_meta->properties());
+        if (!char_filter_map.empty()) {
+            _char_string_reader.reset(CharFilterFactory::create(
+                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE],
+                    _char_string_reader.release(),
+                    char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
+                    
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
+        }
+
         _doc = std::make_unique<lucene::document::Document>();
         _dir.reset(DorisCompoundDirectory::getDirectory(_fs, 
index_path.c_str(), true));
 
@@ -497,7 +508,7 @@ private:
     lucene::document::Field* _field {};
     std::unique_ptr<lucene::index::IndexWriter> _index_writer {};
     std::unique_ptr<lucene::analysis::Analyzer> _analyzer {};
-    std::unique_ptr<lucene::util::SStringReader<char>> _char_string_reader {};
+    std::unique_ptr<lucene::util::Reader> _char_string_reader {};
     std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer;
     std::string _segment_file_name;
     std::string _directory;
diff --git a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
new file mode 100644
index 0000000000..3cef00b125
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
@@ -0,0 +1,46 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
diff --git a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
new file mode 100644
index 0000000000..c8916517f0
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_char_replace") {
+    // prepare test table
+
+    def timeout = 60000
+    def delta_time = 1000
+    def alter_res = "null"
+    def useTime = 0
+
+    def indexTblName = "test_char_replace"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    // create 1 replica table
+    sql """
+       CREATE TABLE IF NOT EXISTS ${indexTblName}(
+               `id` int(11) NULL,
+               `a` text NULL,
+        `b` string NULL,
+        `c` string NULL,
+        INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode") 
COMMENT '',
+               INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = 
"unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", 
"char_filter_replacement" = " ") COMMENT '',
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", 
"char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT ''
+       ) ENGINE=OLAP
+       DUPLICATE KEY(`id`)
+       COMMENT 'OLAP'
+       DISTRIBUTED BY HASH(`id`) BUCKETS 1
+       PROPERTIES(
+               "replication_allocation" = "tag.location.default: 1"
+       );
+    """
+    
+    def var_result = sql "show variables"
+    logger.info("show variales result: " + var_result )
+
+    sql """INSERT INTO ${indexTblName} VALUES
+        (1, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (2, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (3, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (4, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (5, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (6, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (7, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (8, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (9, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'),
+        (10, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg 
HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0')
+    """
+
+    qt_sql "SELECT count() FROM ${indexTblName} where a match 'hm'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match 'bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match 'jpg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match '1'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match '0'";
+    
+    qt_sql "SELECT count() FROM ${indexTblName} where b match 'hm'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match 'bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match 'jpg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match '1'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match '0'";
+
+    qt_sql "SELECT count() FROM ${indexTblName} where c match 'hm'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match 'bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match 'jpg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match '1'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match '0'";
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to