This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 2c546e885ac4abed61b7523d22345cf381f5a8f6 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Sun Sep 10 23:08:28 2023 +0800 [feature](invert index) add tokenizer CharFilter preprocessing (#24102) --- be/src/olap/inverted_index_parser.cpp | 36 ++++++++++ be/src/olap/inverted_index_parser.h | 8 +++ .../char_filter/char_filter_factory.h | 37 ++++++++++ .../char_filter/char_replace_char_filter.cpp | 59 ++++++++++++++++ .../char_filter/char_replace_char_filter.h | 48 +++++++++++++ .../rowset/segment_v2/inverted_index_writer.cpp | 13 +++- .../char_filter/test_char_replace.out | 46 ++++++++++++ .../char_filter/test_char_replace.groovy | 82 ++++++++++++++++++++++ 8 files changed, 328 insertions(+), 1 deletion(-) diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index b0ab8c9d1a..5678a217b5 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -17,6 +17,7 @@ #include "olap/inverted_index_parser.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "util/string_util.h" namespace doris { @@ -83,4 +84,39 @@ std::string get_parser_phrase_support_string_from_properties( return INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO; } } + +CharFilterMap get_parser_char_filter_map_from_properties( + const std::map<std::string, std::string>& properties) { + CharFilterMap char_filter_map; + + if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE) == properties.end()) { + return CharFilterMap(); + } + + std::string type = properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE); + if (type == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { + // type + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] = + INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE; + + // pattern + if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN) == properties.end()) { + return CharFilterMap(); + } + std::string pattern = 
properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN); + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN] = pattern; + + // replacement + std::string replacement = " "; + if (properties.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT) != properties.end()) { + replacement = properties.at(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT); + } + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT] = replacement; + } else { + return CharFilterMap(); + } + + return char_filter_map; +} + } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index eb4c414308..307c78e635 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -38,6 +38,7 @@ struct InvertedIndexCtx { }; using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>; +using CharFilterMap = std::map<std::string, std::string>; const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; @@ -55,6 +56,10 @@ const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false"; +const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; +const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; +const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -65,4 +70,7 @@ std::string get_parser_mode_string_from_properties( std::string get_parser_phrase_support_string_from_properties( const std::map<std::string, std::string>& properties); +CharFilterMap get_parser_char_filter_map_from_properties( + const
std::map<std::string, std::string>& properties); + } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h new file mode 100644 index 0000000000..561054863d --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h @@ -0,0 +1,37 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h" + +namespace doris { + +static const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + +class CharFilterFactory { +public: + template <typename... Args> + static lucene::analysis::CharFilter* create(const std::string& name, Args&&... 
args) { + if (name == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) { + return new CharReplaceCharFilter(std::forward<Args>(args)...); + } + return nullptr; + } +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp new file mode 100644 index 0000000000..c0545b9cf9 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.cpp @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "char_replace_char_filter.h" + +#include <boost/algorithm/string/replace.hpp> + +namespace doris { + +CharReplaceCharFilter::CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern, + const std::string& replacement) + : CharFilter(in), _replacement(replacement) { + std::for_each(pattern.begin(), pattern.end(), [this](uint8_t c) { _patterns.set(c); }); +} + +void CharReplaceCharFilter::init(const void* _value, int32_t _length, bool copyData) { + input_->init(_value, _length, copyData); + fill(); +} + +int32_t CharReplaceCharFilter::read(const void** start, int32_t min, int32_t max) { + return _transformed_input.read(start, min, max); +} + +int32_t CharReplaceCharFilter::readCopy(void* start, int32_t off, int32_t len) { + return _transformed_input.readCopy(start, off, len); +} + +void CharReplaceCharFilter::fill() { + _buf.resize(input_->size()); + input_->readCopy(_buf.data(), 0, _buf.size()); + process_pattern(_buf); + _transformed_input.init(_buf.data(), _buf.size(), false); +} + +void CharReplaceCharFilter::process_pattern(std::string& buf) { + for (char& c : buf) { + uint8_t uc = static_cast<uint8_t>(c); + if (_patterns.test(uc)) { + c = _replacement[0]; + } + } +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h new file mode 100644 index 0000000000..2867890b3e --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <CLucene.h> +#include <CLucene/analysis/CharFilter.h> + +#include <bitset> + +namespace doris { + +class CharReplaceCharFilter : public lucene::analysis::CharFilter { +public: + CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern, + const std::string& replacement); + virtual ~CharReplaceCharFilter() = default; + + void init(const void* _value, int32_t _length, bool copyData) override; + int32_t read(const void** start, int32_t min, int32_t max) override; + int32_t readCopy(void* start, int32_t off, int32_t len) override; + +private: + void fill(); + void process_pattern(std::string& buf); + + std::bitset<256> _patterns; + std::string _replacement; + + std::string _buf; + lucene::util::SStringReader<char> _transformed_input; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 8cd65bbab6..07e8f7a886 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -37,6 +37,7 @@ #include "olap/key_coder.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/common.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include 
"olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" @@ -152,6 +153,16 @@ public: } _char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + CharFilterMap char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta->properties()); + if (!char_filter_map.empty()) { + _char_string_reader.reset(CharFilterFactory::create( + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], + _char_string_reader.release(), + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); + } + _doc = std::make_unique<lucene::document::Document>(); _dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true)); @@ -497,7 +508,7 @@ private: lucene::document::Field* _field {}; std::unique_ptr<lucene::index::IndexWriter> _index_writer {}; std::unique_ptr<lucene::analysis::Analyzer> _analyzer {}; - std::unique_ptr<lucene::util::SStringReader<char>> _char_string_reader {}; + std::unique_ptr<lucene::util::Reader> _char_string_reader {}; std::shared_ptr<lucene::util::bkd::bkd_writer> _bkd_writer; std::string _segment_file_name; std::string _directory; diff --git a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out new file mode 100644 index 0000000000..3cef00b125 --- /dev/null +++ b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out @@ -0,0 +1,46 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + diff --git a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy new file mode 100644 index 0000000000..c8916517f0 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +suite("test_char_replace") { + // prepare test table + + def timeout = 60000 + def delta_time = 1000 + def alter_res = "null" + def useTime = 0 + + def indexTblName = "test_char_replace" + + sql "DROP TABLE IF EXISTS ${indexTblName}" + // create 1 replica table + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName}( + `id` int(11) NULL, + `a` text NULL, + `b` string NULL, + `c` string NULL, + INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode") COMMENT '', + INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '', + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def var_result = sql "show variables" + logger.info("show variales result: " + var_result ) + + sql """INSERT INTO ${indexTblName} VALUES + (1, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (2, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (3, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (4, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (5, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (6, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (7, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (8, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (9, 'GET 
/images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0'), + (10, 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0', 'GET /images/hm_bg.jpg HTTP/1.0') + """ + + qt_sql "SELECT count() FROM ${indexTblName} where a match 'hm'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match 'bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match 'jpg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match '1'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match '0'"; + + qt_sql "SELECT count() FROM ${indexTblName} where b match 'hm'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match 'bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match 'jpg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match '1'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match '0'"; + + qt_sql "SELECT count() FROM ${indexTblName} where c match 'hm'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match 'bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match 'jpg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match '1'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match '0'"; +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org