Re: [PR] [feature](inverted index) add ngram tokenizer [doris]

via GitHub Tue, 15 Apr 2025 09:07:37 -0700


xiaokang commented on code in PR #49472:
URL: https://github.com/apache/doris/pull/49472#discussion_r2040870432



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp:
##########
@@ -92,8 +92,7 @@ void BasicTokenizer::cut() {
             const int32_t prev_i = i;
 
             U8_NEXT(s, i, length, c);
-
-            if (c == U_SENTINEL) {
+            if (c < 0) {

Review Comment:
   ?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown tokenizer name: " + name);
+    }
+}
+
+TokenFilterFactoryPtr get_token_filter_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenFilterFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"lowercase", []() { return 
std::make_shared<LowerCaseFilterFactory>(); }},
+            {"asciifolding", []() { return 
std::make_shared<ASCIIFoldingFilterFactory>(); }},
+            {"word_delimiter", []() { return 
std::make_shared<WordDelimiterFilterFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown token filter name: " + name);
+    }
+}
+
+CustomAnalyzer::CustomAnalyzer(Builder* builder) {
+    _tokenizer = builder->_tokenizer;
+    _token_filters = builder->_token_filters;
+}
+
+TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) {
+    return nullptr;

Review Comment:
   why?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h:
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "common/exception.h"
+#include 
"olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class TokenStreamComponents;
+using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>;
+
+class CustomAnalyzer;
+using CustomAnalyzerPtr = std::shared_ptr<CustomAnalyzer>;
+
+class CustomAnalyzer : public Analyzer {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void with_tokenizer(const std::string& name, const Settings& params);
+        void add_token_filter(const std::string& name, const Settings& params);
+        CustomAnalyzerPtr build();
+
+    private:
+        TokenizerFactoryPtr _tokenizer;
+        std::vector<TokenFilterFactoryPtr> _token_filters;
+
+        friend class CustomAnalyzer;
+    };
+
+    CustomAnalyzer(Builder* builder);
+    ~CustomAnalyzer() override = default;
+
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* 
reader) override;
+    TokenStream* reusableTokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) override;
+
+    static CustomAnalyzerPtr build_custom_analyzer(const 
CustomAnalyzerConfigPtr& config);
+
+private:
+    TokenStreamComponentsPtr create_components();
+
+    TokenizerFactoryPtr _tokenizer;
+    std::vector<TokenFilterFactoryPtr> _token_filters;
+
+    TokenStreamComponentsPtr _reuse_token_stream;
+};
+
+class TokenStreamComponents {

Review Comment:
   It's not easy to understand.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer_factory.h:
##########
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "char_matcher.h"
+#include "ngram_tokenizer.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class NGramTokenizerFactory : public TokenizerFactory {
+public:
+    NGramTokenizerFactory() = default;
+    ~NGramTokenizerFactory() override = default;
+
+    void initialize(const Settings& settings) override;
+
+    TokenizerPtr create() override {
+        if (_matcher == nullptr) {
+            return std::make_shared<NGramTokenizer>(_min_gram, _max_gram);
+        } else {
+            class NGramTokenizerWithMatcher : public NGramTokenizer {

Review Comment:
   A simpler way may be to add a matcher parameter to NGramTokenizer and, if 
the matcher is null (the default value), return true in is_token_char.



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};

Review Comment:
   What about english, chinese, unicode?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h:
##########
@@ -19,9 +19,8 @@
 
 #include <unicode/utext.h>
 
-#include "CLucene.h"
+#include "CLucene.h" // IWYU pragma: keep

Review Comment:
   why keep it?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.cpp:
##########
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer_config.h"
+
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+
+namespace doris::segment_v2::inverted_index {
+
+CustomAnalyzerConfig::CustomAnalyzerConfig(Builder* builder) {
+    _tokenizer_config = builder->_tokenizer_config;
+    _token_filters = builder->_token_filters;
+}
+
+ComponentConfigPtr CustomAnalyzerConfig::get_tokenizer_config() {
+    return _tokenizer_config;
+}
+
+std::vector<ComponentConfigPtr> 
CustomAnalyzerConfig::get_token_filter_configs() {
+    return _token_filters;
+}
+
+void CustomAnalyzerConfig::Builder::add_tokenizer_config(const std::string& 
name,
+                                                         const Settings& 
params) {
+    _tokenizer_config = std::make_shared<ComponentConfig>(name, params);
+}
+
+void CustomAnalyzerConfig::Builder::add_token_filter_config(const std::string& 
name,
+                                                            const Settings& 
params) {
+    _token_filters.emplace_back(std::make_shared<ComponentConfig>(name, 
params));
+}
+
+CustomAnalyzerConfigPtr CustomAnalyzerConfig::Builder::build() {
+    return std::make_shared<CustomAnalyzerConfig>(this);
+}
+
+ComponentConfig::ComponentConfig(std::string name, Settings params)
+        : _name(std::move(name)), _params(std::move(params)) {}
+
+std::string ComponentConfig::get_name() const {
+    return _name;
+}
+
+Settings ComponentConfig::get_param() const {

Review Comment:
   get_params



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h:
##########
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class ComponentConfig;
+using ComponentConfigPtr = std::shared_ptr<ComponentConfig>;
+
+class CustomAnalyzerConfig;
+using CustomAnalyzerConfigPtr = std::shared_ptr<CustomAnalyzerConfig>;
+
+class CustomAnalyzerConfig {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void add_tokenizer_config(const std::string& name, const Settings& 
params);
+        void add_token_filter_config(const std::string& name, const Settings& 
params);
+        CustomAnalyzerConfigPtr build();
+
+    private:
+        ComponentConfigPtr _tokenizer_config;
+        std::vector<ComponentConfigPtr> _token_filters;
+
+        friend class CustomAnalyzerConfig;
+    };
+
+    CustomAnalyzerConfig(Builder* builder);
+    ~CustomAnalyzerConfig() = default;
+
+    ComponentConfigPtr get_tokenizer_config();
+    std::vector<ComponentConfigPtr> get_token_filter_configs();
+
+private:
+    ComponentConfigPtr _tokenizer_config;
+    std::vector<ComponentConfigPtr> _token_filters;
+};
+
+class ComponentConfig {
+public:
+    ComponentConfig(std::string name, Settings params);
+    ~ComponentConfig() = default;
+
+    std::string get_name() const;
+    Settings get_param() const;

Review Comment:
   get_params



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h:
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "common/exception.h"
+#include 
"olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class TokenStreamComponents;
+using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>;
+
+class CustomAnalyzer;
+using CustomAnalyzerPtr = std::shared_ptr<CustomAnalyzer>;
+
+class CustomAnalyzer : public Analyzer {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void with_tokenizer(const std::string& name, const Settings& params);

Review Comment:
   set_tokenizer



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h:
##########
@@ -51,4 +51,5 @@ class InvertedIndexAnalyzer {
             InvertedIndexQueryType query_type,
             const std::map<std::string, std::string>& properties);
 };
+

Review Comment:
   useless change



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h:
##########
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class ComponentConfig;
+using ComponentConfigPtr = std::shared_ptr<ComponentConfig>;
+
+class CustomAnalyzerConfig;
+using CustomAnalyzerConfigPtr = std::shared_ptr<CustomAnalyzerConfig>;
+
+class CustomAnalyzerConfig {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void add_tokenizer_config(const std::string& name, const Settings& 
params);

Review Comment:
   set_tokenizer_config



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown tokenizer name: " + name);
+    }
+}
+
+TokenFilterFactoryPtr get_token_filter_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenFilterFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"lowercase", []() { return 
std::make_shared<LowerCaseFilterFactory>(); }},
+            {"asciifolding", []() { return 
std::make_shared<ASCIIFoldingFilterFactory>(); }},
+            {"word_delimiter", []() { return 
std::make_shared<WordDelimiterFilterFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown token filter name: " + name);
+    }
+}
+
+CustomAnalyzer::CustomAnalyzer(Builder* builder) {
+    _tokenizer = builder->_tokenizer;
+    _token_filters = builder->_token_filters;
+}
+
+TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) {
+    return nullptr;
+}
+
+TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName,
+                                                 lucene::util::Reader* reader) 
{
+    if (_reuse_token_stream == nullptr) {
+        _reuse_token_stream = create_components();
+    }
+    _reuse_token_stream->set_reader(reader);
+    _reuse_token_stream->get_token_stream()->reset();
+    return _reuse_token_stream->get_token_stream().get();
+}
+
+TokenStreamComponentsPtr CustomAnalyzer::create_components() {
+    auto tk = _tokenizer->create();
+    TokenStreamPtr ts = tk;
+    for (const auto& filter : _token_filters) {
+        ts = filter->create(ts);
+    }
+    return std::make_shared<TokenStreamComponents>(tk, ts);
+}
+
+CustomAnalyzerPtr CustomAnalyzer::build_custom_analyzer(const 
CustomAnalyzerConfigPtr& config) {
+    if (config == nullptr) {
+        throw Exception(ErrorCode::ILLEGAL_STATE, "Null configuration 
detected.");
+    }
+    CustomAnalyzer::Builder builder;
+    builder.with_tokenizer(config->get_tokenizer_config()->get_name(),
+                           config->get_tokenizer_config()->get_param());
+    for (const auto& filter_config : config->get_token_filter_configs()) {
+        builder.add_token_filter(filter_config->get_name(), 
filter_config->get_param());

Review Comment:
   ditto



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown tokenizer name: " + name);

Review Comment:
   Can you use doris Exception?



##########
be/src/olap/rowset/segment_v2/inverted_index/setting.h:
##########
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/utf8.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <memory>
+#include <regex>
+#include <unordered_map>
+#include <utility>
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class Settings {
+public:
+    Settings() = default;
+    Settings(std::unordered_map<std::string, std::string> args) : 
_args(std::move(args)) {}
+    Settings(const Settings&) = default;
+    Settings(Settings&&) = default;
+    ~Settings() = default;
+
+    void set(const std::string& key, const std::string& value) {
+        _args.insert(std::make_pair(key, value));

Review Comment:
   What's the difference compared to insert(key, value)?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown tokenizer name: " + name);
+    }
+}
+
+TokenFilterFactoryPtr get_token_filter_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenFilterFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"lowercase", []() { return 
std::make_shared<LowerCaseFilterFactory>(); }},
+            {"asciifolding", []() { return 
std::make_shared<ASCIIFoldingFilterFactory>(); }},
+            {"word_delimiter", []() { return 
std::make_shared<WordDelimiterFilterFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown token filter name: " + name);
+    }
+}
+
+CustomAnalyzer::CustomAnalyzer(Builder* builder) {
+    _tokenizer = builder->_tokenizer;
+    _token_filters = builder->_token_filters;
+}
+
+TokenStream* CustomAnalyzer::tokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) {
+    return nullptr;
+}
+
+TokenStream* CustomAnalyzer::reusableTokenStream(const TCHAR* fieldName,
+                                                 lucene::util::Reader* reader) 
{
+    if (_reuse_token_stream == nullptr) {
+        _reuse_token_stream = create_components();
+    }
+    _reuse_token_stream->set_reader(reader);
+    _reuse_token_stream->get_token_stream()->reset();
+    return _reuse_token_stream->get_token_stream().get();
+}
+
+TokenStreamComponentsPtr CustomAnalyzer::create_components() {
+    auto tk = _tokenizer->create();
+    TokenStreamPtr ts = tk;
+    for (const auto& filter : _token_filters) {
+        ts = filter->create(ts);
+    }
+    return std::make_shared<TokenStreamComponents>(tk, ts);
+}
+
+CustomAnalyzerPtr CustomAnalyzer::build_custom_analyzer(const 
CustomAnalyzerConfigPtr& config) {
+    if (config == nullptr) {
+        throw Exception(ErrorCode::ILLEGAL_STATE, "Null configuration 
detected.");
+    }
+    CustomAnalyzer::Builder builder;
+    builder.with_tokenizer(config->get_tokenizer_config()->get_name(),

Review Comment:
   why not just pass the whole ComponentConfig?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {

Review Comment:
   get_tokenizer



##########
be/src/olap/rowset/segment_v2/inverted_index/setting.h:
##########
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/utf8.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <memory>
+#include <regex>
+#include <unordered_map>
+#include <utility>
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class Settings {
+public:
+    Settings() = default;
+    Settings(std::unordered_map<std::string, std::string> args) : 
_args(std::move(args)) {}
+    Settings(const Settings&) = default;
+    Settings(Settings&&) = default;
+    ~Settings() = default;
+
+    void set(const std::string& key, const std::string& value) {
+        _args.insert(std::make_pair(key, value));
+    }
+
+    bool empty() const { return _args.empty(); }
+
+    bool get_bool(const std::string& key, bool default_value) const {
+        auto it = _args.find(key);
+        if (it != _args.end()) {
+            std::string value = it->second;
+            std::transform(value.begin(), value.end(), value.begin(),
+                           [](unsigned char c) { return std::tolower(c); });
+            if (value == "true" || value == "1") {
+                return true;
+            } else if (value == "false" || value == "0") {
+                return false;
+            }
+        }
+        return default_value;
+    }
+
+    int32_t get_int(const std::string& key, int32_t default_value) const {
+        auto it = _args.find(key);
+        if (it != _args.end()) {
+            try {
+                size_t pos;
+                int num = std::stoi(it->second, &pos);
+                if (pos == it->second.size()) {
+                    return num;
+                }
+            } catch (...) {
+                throw Exception(ErrorCode::INVALID_ARGUMENT,
+                                "stoi failed (invalid argument or out of 
range): " + it->second);
+            }
+        }
+        return default_value;
+    }
+
+    std::string get_string(const std::string& key) const {

Review Comment:
   Why not add a default_value parameter, as get_bool and get_int have?



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.h:
##########
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "common/exception.h"
+#include 
"olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer_config.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/token_filter_factory.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class TokenStreamComponents;
+using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>;
+
+class CustomAnalyzer;
+using CustomAnalyzerPtr = std::shared_ptr<CustomAnalyzer>;
+
+class CustomAnalyzer : public Analyzer {
+public:
+    class Builder {
+    public:
+        Builder() = default;
+        ~Builder() = default;
+
+        void with_tokenizer(const std::string& name, const Settings& params);
+        void add_token_filter(const std::string& name, const Settings& params);
+        CustomAnalyzerPtr build();
+
+    private:
+        TokenizerFactoryPtr _tokenizer;
+        std::vector<TokenFilterFactoryPtr> _token_filters;
+
+        friend class CustomAnalyzer;
+    };
+
+    CustomAnalyzer(Builder* builder);
+    ~CustomAnalyzer() override = default;
+
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* 
reader) override;
+    TokenStream* reusableTokenStream(const TCHAR* fieldName, 
lucene::util::Reader* reader) override;
+
+    static CustomAnalyzerPtr build_custom_analyzer(const 
CustomAnalyzerConfigPtr& config);

Review Comment:
   Unify config, settings, and params under a single name.



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);
+        return tk;
+    } else {
+        throw std::invalid_argument("Unknown tokenizer name: " + name);
+    }
+}
+
+TokenFilterFactoryPtr get_token_filter_factory(const std::string& name, const 
Settings& params) {

Review Comment:
   get_token_filter



##########
be/src/olap/rowset/segment_v2/inverted_index/setting.h:
##########
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/utf8.h>
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <memory>
+#include <regex>
+#include <unordered_map>
+#include <utility>
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class Settings {

Review Comment:
   Parameters



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h:
##########
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2::inverted_index {
+
+class KeywordTokenizer : public DorisTokenizer {
+public:
+    KeywordTokenizer() = default;
+    ~KeywordTokenizer() override = default;
+
+    void initialize(int32_t buffer_size = DEFAULT_BUFFER_SIZE) {
+        if (buffer_size > MAX_TOKEN_LENGTH_LIMIT || buffer_size <= 0) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "maxTokenLen must be greater than 0 and less than 
" +
+                                    std::to_string(MAX_TOKEN_LENGTH_LIMIT) +
+                                    " passed: " + std::to_string(buffer_size));
+        }
+        _buffer_size = std::min(buffer_size, MAX_TOKEN_LENGTH_LIMIT);
+    }
+
+    Token* next(Token* token) override {
+        if (!_done) {
+            _done = true;
+            if (_char_buffer == nullptr) {
+                return nullptr;
+            }
+            int32_t length = std::min(_char_length, MAX_TOKEN_LENGTH_LIMIT);
+            std::string_view term(_char_buffer, length);

Review Comment:
   The temporary variable `term` can be eliminated.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h:
##########
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "keyword_tokenizer.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class KeywordTokenizerFactory : public TokenizerFactory {
+public:
+    KeywordTokenizerFactory() = default;
+    ~KeywordTokenizerFactory() override = default;
+
+    void initialize(const Settings& settings) override {
+        _max_token_len = settings.get_int("max_token_len", 
KeywordTokenizer::DEFAULT_BUFFER_SIZE);
+        if (_max_token_len > KeywordTokenizer::MAX_TOKEN_LENGTH_LIMIT || 
_max_token_len <= 0) {

Review Comment:
   This check duplicates the one in KeywordTokenizer::initialize.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h:
##########
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "CLucene.h" // IWYU pragma: keep
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "common/exception.h"
+
+using namespace lucene::analysis;
+
+using TokenStreamPtr = std::shared_ptr<TokenStream>;
+using TokenPtr = std::shared_ptr<Token>;
+
+namespace doris::segment_v2::inverted_index {
+
+class DorisTokenizer : public Tokenizer {

Review Comment:
   Add comment for the interfaces that should be implemented by a 
DorisTokenizer developer.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_impl.h:
##########
@@ -0,0 +1,212 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/uchar.h>
+#include <unicode/unistr.h>
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+namespace doris::segment_v2::inverted_index {
+
+class StandardTokenizerImpl {

Review Comment:
   Is it the same as StandardTokenizer?



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer.h:
##########
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/unistr.h>
+#include <unicode/utext.h>
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2::inverted_index {
+
+class NGramTokenizer : public DorisTokenizer {
+public:
+    NGramTokenizer(int32_t min_gram, int32_t max_gram, bool edges_only);
+    NGramTokenizer(int32_t min_gram, int32_t max_gram);
+    NGramTokenizer();
+    ~NGramTokenizer() override = default;
+
+    Token* next(Token* token) override;
+    void reset() override;
+
+    static constexpr int32_t DEFAULT_MIN_NGRAM_SIZE = 1;
+    static constexpr int32_t DEFAULT_MAX_NGRAM_SIZE = 2;
+
+private:
+    void init(int32_t min_gram, int32_t max_gram, bool edges_only);
+    void update_last_non_token_char();
+
+    void consume() {
+        uint8_t c = _buffer[_buffer_start++];
+        _offset += U8_LENGTH(c);
+    }
+
+    virtual bool is_token_char(UChar32 chr) { return true; }
+
+    std::pair<int32_t, int32_t> to_code_points(const char* char_buffer, 
int32_t char_offset,
+                                               int32_t char_length, 
std::vector<UChar32>& buffer,
+                                               int32_t buffer_end) const;
+
+    std::string to_chars(const std::vector<UChar32>& buffer, int32_t start, 
int32_t size);
+
+    int32_t _buffer_start = 0;

Review Comment:
   add comment for the memory layout and variables.



##########
be/src/olap/rowset/segment_v2/inverted_index/analyzer/custom_analyzer.cpp:
##########
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "custom_analyzer.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/loser_case_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
+#include 
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+TokenizerFactoryPtr get_tokenizer_factory(const std::string& name, const 
Settings& params) {
+    using FactoryCreator = std::function<TokenizerFactoryPtr()>;
+
+    static const std::map<std::string, FactoryCreator> factoryCreators = {
+            {"standard", []() { return 
std::make_shared<StandardTokenizerFactory>(); }},
+            {"keyword", []() { return 
std::make_shared<KeywordTokenizerFactory>(); }},
+            {"edge_ngram", []() { return 
std::make_shared<EdgeNGramTokenizerFactory>(); }}};
+
+    auto it = factoryCreators.find(name);
+    if (it != factoryCreators.end()) {
+        auto tk = it->second();
+        tk->initialize(params);

Review Comment:
   This makes the factory stateful. Can you just call create() with params and 
return a Tokenizer?



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer_factory.h:
##########
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "char_matcher.h"
+#include "ngram_tokenizer.h"
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class NGramTokenizerFactory : public TokenizerFactory {
+public:
+    NGramTokenizerFactory() = default;
+    ~NGramTokenizerFactory() override = default;
+
+    void initialize(const Settings& settings) override;
+
+    TokenizerPtr create() override {
+        if (_matcher == nullptr) {
+            return std::make_shared<NGramTokenizer>(_min_gram, _max_gram);
+        } else {
+            class NGramTokenizerWithMatcher : public NGramTokenizer {

Review Comment:
   Move the code to ngram_tokenizer.h



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer.h:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+#include "standard_tokenizer_impl.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class StandardTokenizer : public DorisTokenizer {
+public:
+    StandardTokenizer() { _scanner = 
std::make_unique<StandardTokenizerImpl>(); }
+
+    Token* next(Token* t) override {
+        while (true) {
+            int32_t tokenType = _scanner->get_next_token();
+
+            if (tokenType == StandardTokenizerImpl::YYEOF) {
+                break;
+            }
+
+            if (_scanner->yylength() <= _max_token_length) {
+                std::string_view term = _scanner->get_text();
+                t->setNoCopy(term.data(), 0, term.size());
+                return t;
+            }

Review Comment:
   Is it right to continue the loop in the `else` case?



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer.cpp:
##########
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ngram_tokenizer.h"
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+
+NGramTokenizer::NGramTokenizer(int32_t min_gram, int32_t max_gram, bool 
edges_only) {
+    init(min_gram, max_gram, edges_only);
+}
+
+NGramTokenizer::NGramTokenizer(int32_t min_gram, int32_t max_gram)
+        : NGramTokenizer(min_gram, max_gram, false) {}
+
+NGramTokenizer::NGramTokenizer() : NGramTokenizer(DEFAULT_MIN_NGRAM_SIZE, 
DEFAULT_MAX_NGRAM_SIZE) {}
+
+Token* NGramTokenizer::next(Token* token) {

Review Comment:
   The code in this function is not easy to understand. Please add some 
comments.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h:
##########
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2::inverted_index {
+
+class KeywordTokenizer : public DorisTokenizer {
+public:
+    KeywordTokenizer() = default;
+    ~KeywordTokenizer() override = default;
+
+    void initialize(int32_t buffer_size = DEFAULT_BUFFER_SIZE) {
+        if (buffer_size > MAX_TOKEN_LENGTH_LIMIT || buffer_size <= 0) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "maxTokenLen must be greater than 0 and less than 
" +
+                                    std::to_string(MAX_TOKEN_LENGTH_LIMIT) +
+                                    " passed: " + std::to_string(buffer_size));
+        }
+        _buffer_size = std::min(buffer_size, MAX_TOKEN_LENGTH_LIMIT);
+    }
+
+    Token* next(Token* token) override {
+        if (!_done) {
+            _done = true;
+            if (_char_buffer == nullptr) {
+                return nullptr;
+            }
+            int32_t length = std::min(_char_length, MAX_TOKEN_LENGTH_LIMIT);
+            std::string_view term(_char_buffer, length);
+            token->set(term.data(), 0, term.size());
+            return token;
+        }
+        return nullptr;
+    }
+
+    void reset() override {
+        DorisTokenizer::reset();
+        _done = false;
+        _char_buffer = nullptr;
+        _char_length = _in->read((const void**)&_char_buffer, 0, _in->size());
+    }
+
+    static constexpr int32_t DEFAULT_BUFFER_SIZE = 256;

Review Comment:
   Is it equal to ES?



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/ngram/ngram_tokenizer_factory.cpp:
##########
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ngram_tokenizer_factory.h"
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+
+std::unordered_map<std::string, CharMatcherPtr> 
NGramTokenizerFactory::MATCHERS;
+
+void NGramTokenizerFactory::initialize(const Settings& settings) {
+    _min_gram = settings.get_int("min_gram", 
NGramTokenizer::DEFAULT_MIN_NGRAM_SIZE);
+    _max_gram = settings.get_int("max_gram", 
NGramTokenizer::DEFAULT_MAX_NGRAM_SIZE);
+    int32_t ngram_diff = _max_gram - _min_gram;
+    if (ngram_diff > 1) {
+        throw Exception(
+                ErrorCode::INVALID_ARGUMENT,
+                "The difference between max_gram and min_gram in NGram 
Tokenizer must be less "
+                "than or equal to: [ 1 ] but was [" +
+                        std::to_string(ngram_diff) + "]");
+    }
+    _matcher = parse_token_chars(settings);
+}
+
+void NGramTokenizerFactory::initialize_matchers() {
+    static std::once_flag once_flag;
+    std::call_once(once_flag, []() {
+        MATCHERS["letter"] = 
std::make_shared<BasicCharMatcher>(BasicCharMatcher::Type::LETTER);
+        MATCHERS["digit"] = 
std::make_shared<BasicCharMatcher>(BasicCharMatcher::Type::DIGIT);
+        MATCHERS["whitespace"] =
+                
std::make_shared<BasicCharMatcher>(BasicCharMatcher::Type::WHITESPACE);
+        MATCHERS["punctuation"] =
+                
std::make_shared<BasicCharMatcher>(BasicCharMatcher::Type::PUNCTUATION);
+        MATCHERS["symbol"] = 
std::make_shared<BasicCharMatcher>(BasicCharMatcher::Type::SYMBOL);
+    });
+}
+
+CharMatcherPtr NGramTokenizerFactory::parse_token_chars(const Settings& 
settings) {
+    if (settings.empty()) {
+        return nullptr;
+    }
+    auto characters = settings.get_word_list("token_chars");
+    if (characters.empty()) {
+        return nullptr;
+    }
+    CharMatcherBuilder builder;
+    for (const auto& character : characters) {
+        initialize_matchers();
+        auto matcher = MATCHERS.find(character);

Review Comment:
   I think clearer logic would be:
   ```
   if (character == "custom") {
       // process custom matcher
   } else {
       // process pre-defined matcher
   }
   ```



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter.cpp:
##########
@@ -0,0 +1,2014 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ascii_folding_filter.h"
+
+#include <string_view>
+
+namespace doris::segment_v2::inverted_index {
+
+ASCIIFoldingFilter::ASCIIFoldingFilter(const TokenStreamPtr& in, bool 
preserve_original)
+        : DorisTokenFilter(in), _preserve_original(preserve_original), 
_output(512, 0) {}
+
+Token* ASCIIFoldingFilter::next(Token* t) {
+    if (!_state.empty()) {
+        assert(_preserve_original);
+        t->set(_state.data(), 0, _state.size());
+        t->setPositionIncrement(0);
+        return t;
+    }
+    if (_in->next(t)) {
+        const char* buffer = t->termBuffer<char>();
+        int32_t length = t->termLength<char>();
+        for (int32_t i = 0; i < length;) {
+            UChar32 c = U_UNASSIGNED;
+            U8_NEXT(buffer, i, length, c);
+            if (c < 0) {

Review Comment:
   Please add a comment explaining the `c < 0` check.



##########
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer.h:
##########
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+#include "standard_tokenizer_impl.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class StandardTokenizer : public DorisTokenizer {
+public:
+    StandardTokenizer() { _scanner = 
std::make_unique<StandardTokenizerImpl>(); }
+
+    Token* next(Token* t) override {
+        while (true) {
+            int32_t tokenType = _scanner->get_next_token();
+
+            if (tokenType == StandardTokenizerImpl::YYEOF) {
+                break;
+            }
+
+            if (_scanner->yylength() <= _max_token_length) {
+                std::string_view term = _scanner->get_text();
+                t->setNoCopy(term.data(), 0, term.size());
+                return t;
+            }
+        }
+        return nullptr;
+    }
+
+    void reset() override {
+        DorisTokenizer::reset();
+        const char* _char_buffer = nullptr;
+        size_t length = _in->read((const void**)&_char_buffer, 0, _in->size());
+        _scanner->yyreset({_char_buffer, length});
+    };
+
+    void set_max_token_length(int32_t length) {
+        if (length < 1) {
+            throw Exception(ErrorCode::INVALID_ARGUMENT,
+                            "max_token_length must be greater than zero");
+        } else if (length > MAX_TOKEN_LENGTH_LIMIT) {
+            throw Exception(
+                    ErrorCode::INVALID_ARGUMENT,
+                    "max_token_length may not exceed " + 
std::to_string(MAX_TOKEN_LENGTH_LIMIT));
+        }
+        if (length != _max_token_length) {

Review Comment:
   Just assign unconditionally to avoid a branch misprediction.



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter.cpp:
##########
@@ -0,0 +1,2014 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ascii_folding_filter.h"
+
+#include <string_view>
+
+namespace doris::segment_v2::inverted_index {
+
+ASCIIFoldingFilter::ASCIIFoldingFilter(const TokenStreamPtr& in, bool 
preserve_original)
+        : DorisTokenFilter(in), _preserve_original(preserve_original), 
_output(512, 0) {}
+
+Token* ASCIIFoldingFilter::next(Token* t) {
+    if (!_state.empty()) {

Review Comment:
   Can you use a more meaningful name for `_state`?



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h:
##########
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "token_filter.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class LowerCaseFilter : public DorisTokenFilter {
+public:
+    LowerCaseFilter(const TokenStreamPtr& in) : DorisTokenFilter(in) {}
+    ~LowerCaseFilter() override = default;
+
+    Token* next(Token* t) override {
+        if (_in->next(t) == nullptr) {
+            return nullptr;
+        }
+
+        std::string_view term(t->termBuffer<char>(), t->termLength<char>());
+        std::transform(term.begin(), term.end(), 
const_cast<char*>(term.data()), [](char c) {
+            if (static_cast<uint8_t>(c) < 0x80) {

Review Comment:
   Please add a comment explaining the 0x80 threshold.



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter.cpp:
##########
@@ -0,0 +1,2014 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ascii_folding_filter.h"
+
+#include <string_view>
+
+namespace doris::segment_v2::inverted_index {
+
+ASCIIFoldingFilter::ASCIIFoldingFilter(const TokenStreamPtr& in, bool 
preserve_original)
+        : DorisTokenFilter(in), _preserve_original(preserve_original), 
_output(512, 0) {}
+
+Token* ASCIIFoldingFilter::next(Token* t) {
+    if (!_state.empty()) {
+        assert(_preserve_original);
+        t->set(_state.data(), 0, _state.size());
+        t->setPositionIncrement(0);
+        return t;
+    }
+    if (_in->next(t)) {
+        const char* buffer = t->termBuffer<char>();
+        int32_t length = t->termLength<char>();
+        for (int32_t i = 0; i < length;) {
+            UChar32 c = U_UNASSIGNED;
+            U8_NEXT(buffer, i, length, c);
+            if (c < 0) {
+                continue;
+            }
+            if (c >= 0x0080) {
+                fold_to_ascii(buffer, length);
+                t->setNoCopy(_output.data(), 0, _output_pos);
+                break;
+            }
+        }
+        return t;
+    }
+    return nullptr;
+}
+
+void ASCIIFoldingFilter::reset() {
+    DorisTokenFilter::reset();
+    _state.clear();
+}
+
+void ASCIIFoldingFilter::fold_to_ascii(const char* in, int32_t length) {
+    int32_t max_size_needed = 4 * length;
+    if (_output.size() < max_size_needed) {
+        _output.resize(max_size_needed);
+    }
+
+    _output_pos = fold_to_ascii(in, 0, _output.data(), 0, length);

Review Comment:
   Is passing 0 as input_pos right? I think it should be `i`.



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter.h:
##########
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "token_filter.h"
+#include "word_delimiter_iterator.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class WordDelimiterConcatenation;
+using WordDelimiterConcatenationPtr = 
std::unique_ptr<WordDelimiterConcatenation>;
+
+class WordDelimiterFilter : public DorisTokenFilter {
+public:
+    WordDelimiterFilter(const TokenStreamPtr& in, std::vector<char> 
char_type_table,
+                        int32_t configuration_flags, 
std::unordered_set<std::string> prot_words);
+    ~WordDelimiterFilter() override = default;
+
+    Token* next(Token* t) override;
+    void reset() override;
+
+    static bool is_alpha(int32_t type) { return (type & ALPHA) != 0; }
+    static bool is_digit(int32_t type) { return (type & DIGIT) != 0; }
+
+    static constexpr int32_t LOWER = 0x01;

Review Comment:
   Please add comments for the types and flags.



##########
be/src/olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter.cpp:
##########
@@ -0,0 +1,2014 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ascii_folding_filter.h"
+
+#include <string_view>
+
+namespace doris::segment_v2::inverted_index {
+
+ASCIIFoldingFilter::ASCIIFoldingFilter(const TokenStreamPtr& in, bool 
preserve_original)
+        : DorisTokenFilter(in), _preserve_original(preserve_original), 
_output(512, 0) {}
+
+Token* ASCIIFoldingFilter::next(Token* t) {
+    if (!_state.empty()) {
+        assert(_preserve_original);
+        t->set(_state.data(), 0, _state.size());
+        t->setPositionIncrement(0);
+        return t;
+    }
+    if (_in->next(t)) {
+        const char* buffer = t->termBuffer<char>();
+        int32_t length = t->termLength<char>();
+        for (int32_t i = 0; i < length;) {
+            UChar32 c = U_UNASSIGNED;
+            U8_NEXT(buffer, i, length, c);
+            if (c < 0) {
+                continue;
+            }
+            if (c >= 0x0080) {

Review Comment:
   Please add a comment explaining the 0x0080 threshold.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [feature](inverted index) add ngram tokenizer [doris]

Reply via email to