Re: [PR] [feature](inverted index) support multiple tokenize index in one column [doris]

via GitHub Thu, 18 Dec 2025 02:42:11 -0800


Copilot commented on code in PR #59117:
URL: https://github.com/apache/doris/pull/59117#discussion_r2630506527



##########
fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java:
##########
@@ -395,4 +395,11 @@ public boolean isAnalyzedInvertedIndex() {
                 || 
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY)
                 || 
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY));
     }
+
+    public String getAnalyzerIdentity() {
+        if (indexType != IndexDefinition.IndexType.INVERTED) {
+            return "";
+        }
+        return InvertedIndexUtil.buildAnalyzerIdentity(properties);
+    }

Review Comment:
   Missing test coverage for the new getAnalyzerIdentity method. This new 
public method returns analyzer identity for indexes but doesn't appear to be 
tested. Consider adding unit tests to verify it returns the correct identity 
for different property configurations.



##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+    def tableName = "tbl_match_using_analyzer"
+    def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+    def analyzerStandard = "first_name_standard_match_test"
+    def analyzerEdge = "edge_ngram_name_match_test"
+    def analyzerKeyword = "first_name_keyword_match_test"
+
+    def waitAnalyzerReady = { analyzerName ->
+        sleep(10000)
+        int maxRetry = 30
+        boolean ready = false
+        Exception lastException = null
+        for (int i = 0; i < maxRetry && !ready; i++) {
+            try {
+                sql """ select tokenize("probe", 
'"analyzer"="${analyzerName}"'); """
+                ready = true
+            } catch (Exception e) {
+                lastException = e
+                logger.info("Analyzer ${analyzerName} not ready yet: 
${e.message}")
+                sql """ select sleep(1) """
+            }
+        }
+        assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting: 
${lastException?.message}")
+    }
+
+    try {
+        sql "DROP TABLE IF EXISTS ${tableName}"
+
+        sql """
+            CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+            PROPERTIES
+            (
+                "type" = "edge_ngram",
+                "min_gram" = "1",
+                "max_gram" = "20",
+                "token_chars" = "letter"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+            PROPERTIES
+            (
+                "tokenizer" = "standard",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+            PROPERTIES
+            (
+                "tokenizer" = "${tokenizerName}",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+            PROPERTIES
+            (
+                "tokenizer" = "keyword",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        //waitAnalyzerReady(analyzerStandard)
+        //waitAnalyzerReady(analyzerEdge)
+        //waitAnalyzerReady(analyzerKeyword)

Review Comment:
   Commented-out code should be removed. These analyzer readiness check calls 
appear to be intentionally disabled but if they're not needed, they should be 
deleted rather than commented out. If they're needed for future use, consider 
documenting why they're disabled.



##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+    def tableName = "tbl_match_using_analyzer"
+    def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+    def analyzerStandard = "first_name_standard_match_test"
+    def analyzerEdge = "edge_ngram_name_match_test"
+    def analyzerKeyword = "first_name_keyword_match_test"
+
+    def waitAnalyzerReady = { analyzerName ->
+        sleep(10000)
+        int maxRetry = 30
+        boolean ready = false
+        Exception lastException = null
+        for (int i = 0; i < maxRetry && !ready; i++) {
+            try {
+                sql """ select tokenize("probe", 
'"analyzer"="${analyzerName}"'); """
+                ready = true
+            } catch (Exception e) {
+                lastException = e
+                logger.info("Analyzer ${analyzerName} not ready yet: 
${e.message}")
+                sql """ select sleep(1) """
+            }
+        }
+        assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting: 
${lastException?.message}")
+    }
+
+    try {
+        sql "DROP TABLE IF EXISTS ${tableName}"
+
+        sql """
+            CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+            PROPERTIES
+            (
+                "type" = "edge_ngram",
+                "min_gram" = "1",
+                "max_gram" = "20",
+                "token_chars" = "letter"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+            PROPERTIES
+            (
+                "tokenizer" = "standard",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+            PROPERTIES
+            (
+                "tokenizer" = "${tokenizerName}",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+            PROPERTIES
+            (
+                "tokenizer" = "keyword",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        //waitAnalyzerReady(analyzerStandard)
+        //waitAnalyzerReady(analyzerEdge)
+        //waitAnalyzerReady(analyzerKeyword)
+
+        sql """
+            CREATE TABLE ${tableName} (
+                id INT,
+                first_name STRING,
+                INDEX idx_fn_std (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerStandard}", "support_phrase"="true"),
+                INDEX idx_fn_ngr (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerEdge}"),
+                INDEX idx_fn_exact (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerKeyword}")
+            ) DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES (
+                "replication_allocation" = "tag.location.default: 1",
+                "disable_auto_compaction" = "true"
+            )
+        """
+        sleep(10000)
+        sql """
+            INSERT INTO ${tableName} VALUES
+                (1, 'alice'),
+                (2, 'alice cooper'),
+                (3, 'bob'),
+                (4, 'allyson');
+        """
+
+        sql "sync"
+
+        // default analyzer should match both tokenized rows that contain 
"alice"
+        qt_match_default """
+            SELECT id FROM ${tableName}
+            WHERE first_name MATCH 'alice'
+            ORDER BY id
+        """
+
+        // keyword analyzer only matches the full literal "alice"
+        qt_match_keyword """
+            SELECT id FROM ${tableName}
+            WHERE first_name MATCH 'alice' USING ANALYZER ${analyzerKeyword}
+            ORDER BY id
+        """
+
+        // edge ngram analyzer should cover names beginning with "al"
+        qt_match_edge """
+            SELECT id FROM ${tableName}
+            WHERE first_name MATCH_PHRASE_PREFIX 'al' USING ANALYZER 
${analyzerEdge}
+            ORDER BY id
+        """
+
+        // show create table should contain analyzer names
+        def showCreate = sql """ SHOW CREATE TABLE ${tableName} """
+        def createStmt = showCreate.collect { row -> 
row[1].toString().toLowerCase() }.join("\n")
+        assertTrue(createStmt.contains(analyzerStandard.toLowerCase()))
+        assertTrue(createStmt.contains(analyzerEdge.toLowerCase()))
+        assertTrue(createStmt.contains(analyzerKeyword.toLowerCase()))
+    } finally {
+        /*sql "DROP TABLE IF EXISTS ${tableName}"
+
+        try {
+            sql "DROP INVERTED INDEX ANALYZER ${analyzerStandard}"
+        } catch (Exception ignored) {
+        }
+
+        try {
+            sql "DROP INVERTED INDEX ANALYZER ${analyzerEdge}"
+        } catch (Exception ignored) {
+        }
+
+        try {
+            sql "DROP INVERTED INDEX ANALYZER ${analyzerKeyword}"
+        } catch (Exception ignored) {
+        }
+
+        try {
+            sql "DROP INVERTED INDEX TOKENIZER ${tokenizerName}"
+        } catch (Exception ignored) {
+        }*/

Review Comment:
   Large block of commented-out cleanup code should be removed or uncommented. 
The finally block contains extensive cleanup logic that is commented out. If 
this cleanup is necessary, it should be active. If it's not needed (perhaps 
handled elsewhere), the commented code should be deleted to improve code 
clarity.



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -137,25 +157,32 @@ public boolean equals(Object obj) {
         if (!super.equals(obj)) {
             return false;
         }
-        return ((MatchPredicate) obj).op == op;
+        MatchPredicate other = (MatchPredicate) obj;
+        return other.op == op
+                && Objects.equals(explicitAnalyzer, other.explicitAnalyzer)
+                && Objects.equals(invertedIndexAnalyzerName, 
other.invertedIndexAnalyzerName)
+                && Objects.equals(invertedIndexParser, 
other.invertedIndexParser);
     }
 
     @Override
     public String toSqlImpl() {
-        return getChild(0).toSql() + " " + op.toString() + " " + 
getChild(1).toSql();
+        return getChild(0).toSql() + " " + op.toString() + " " + 
getChild(1).toSql()
+                + analyzerSqlFragment();
     }
 
     @Override
     public String toSqlImpl(boolean disableTableName, boolean needExternalSql, 
TableType tableType,
             TableIf table) {
         return getChild(0).toSql(disableTableName, needExternalSql, tableType, 
table) + " " + op.toString() + " "
-                + getChild(1).toSql(disableTableName, needExternalSql, 
tableType, table);
+                + getChild(1).toSql(disableTableName, needExternalSql, 
tableType, table)
+                + analyzerSqlFragment();
     }
 
     @Override
     protected void toThrift(TExprNode msg) {
         msg.node_type = TExprNodeType.MATCH_PRED;
         msg.setOpcode(op.getOpcode());
+        // Use new TMatchPredicate constructor with required fields

Review Comment:
   Incorrect comment about TMatchPredicate constructor. The comment states "Use 
new TMatchPredicate constructor with required fields" but the code is actually 
using the existing TMatchPredicate constructor with two parameters. This 
comment is misleading and should be removed or clarified.
   ```suggestion
           // Initialize TMatchPredicate with parser settings and additional 
configuration
   ```



##########
be/src/olap/rowset/segment_v2/inverted_index_iterator.h:
##########
@@ -31,7 +33,6 @@ struct InvertedIndexParam {
     uint32_t num_rows;
     std::shared_ptr<roaring::Roaring> roaring;
     bool skip_try = false;
-
     // Pointer to analyzer context (can be nullptr if not needed)

Review Comment:
   Unnecessary trailing whitespace in comment. Line 36 ends with a comment that 
appears to have trailing whitespace based on the diff context. This affects 
code cleanliness.



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -75,54 +77,72 @@ public TExprOpcode getOpcode() {
 
     @SerializedName("op")
     private Operator op;
+    // Fields for thrift serialization (restored from old version)
     private String invertedIndexParser;
     private String invertedIndexParserMode;
     private Map<String, String> invertedIndexCharFilter;
     private boolean invertedIndexParserLowercase = true;
     private String invertedIndexParserStopwords = "";
     private String invertedIndexAnalyzerName = "";
+    // Fields for SQL generation
+    private String explicitAnalyzer = "";
 
     private MatchPredicate() {
         // use for serde only
         invertedIndexParser = InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN;
         invertedIndexParserMode = 
InvertedIndexUtil.INVERTED_INDEX_PARSER_FINE_GRANULARITY;
     }
 
+    protected MatchPredicate(MatchPredicate other) {
+        super(other);
+        op = other.op;
+        invertedIndexParser = other.invertedIndexParser;
+        invertedIndexParserMode = other.invertedIndexParserMode;
+        invertedIndexCharFilter = other.invertedIndexCharFilter;
+        invertedIndexParserLowercase = other.invertedIndexParserLowercase;
+        invertedIndexParserStopwords = other.invertedIndexParserStopwords;
+        invertedIndexAnalyzerName = other.invertedIndexAnalyzerName;
+        explicitAnalyzer = other.explicitAnalyzer;
+    }
+
     /**
      * use for Nereids ONLY
      */
     public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
             NullableMode nullableMode, Index invertedIndex, boolean nullable) {
+        this(op, e1, e2, retType, nullableMode, invertedIndex, nullable, null);
+    }

Review Comment:
   Constructor order is unusual - simpler constructor delegates to more complex 
one which then delegates to another. Consider reordering: have the most complex 
constructor first, then simpler ones delegate to it. This would make the 
delegation chain clearer and follow common Java constructor patterns.



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -75,54 +77,72 @@ public TExprOpcode getOpcode() {
 
     @SerializedName("op")
     private Operator op;
+    // Fields for thrift serialization (restored from old version)
     private String invertedIndexParser;
     private String invertedIndexParserMode;
     private Map<String, String> invertedIndexCharFilter;
     private boolean invertedIndexParserLowercase = true;
     private String invertedIndexParserStopwords = "";
     private String invertedIndexAnalyzerName = "";
+    // Fields for SQL generation

Review Comment:
   Unnecessary comment about code that was moved. The comment "Fields for 
thrift serialization (restored from old version)" seems to reference 
implementation history rather than explaining the purpose of these fields. 
Consider updating to describe what these fields are used for functionally, or 
remove if self-explanatory.
   ```suggestion
       // Inverted index analyzer configuration persisted via Thrift 
serialization
       private String invertedIndexParser;
       private String invertedIndexParserMode;
       private Map<String, String> invertedIndexCharFilter;
       private boolean invertedIndexParserLowercase = true;
       private String invertedIndexParserStopwords = "";
       private String invertedIndexAnalyzerName = "";
       // Analyzer name explicitly specified in the SQL MATCH predicate, used 
for SQL generation
   ```



##########
be/src/vec/exprs/vmatch_predicate.cpp:
##########
@@ -29,10 +29,10 @@
 #include <gen_cpp/Exprs_types.h>
 #include <glog/logging.h>
 
-#include <cstddef>
 #include <memory>
-#include <ostream>
+#include <string>
 #include <string_view>
+#include <type_traits>

Review Comment:
   Unnecessary inclusion of includes in comment. The comment references 
"cstddef" and "ostream" which are no longer included after the changes. These 
appear to be removed includes that shouldn't be mentioned in the comment about 
what changed. Consider removing them from the diff context or clarifying the 
comment intent.



##########
be/src/vec/functions/match.cpp:
##########
@@ -121,7 +127,6 @@ Status FunctionMatchBase::execute_impl(FunctionContext* 
context, Block& block,
     VLOG_DEBUG << "begin to execute match directly, column_name=" << 
column_name
                << ", match_query_str=" << match_query_str;
     auto* analyzer_ctx = get_match_analyzer_ctx(context);

Review Comment:
   Unnecessary change removing blank line. This change removes a blank line 
that was separating the analyzer_ctx initialization from the source column 
extraction. The blank line improved readability by grouping related operations. 
Consider keeping it.
   ```suggestion
       auto* analyzer_ctx = get_match_analyzer_ctx(context);
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzerSelector.java:
##########
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.catalog.Index;
+
+import com.google.common.base.Strings;
+
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * Helper for selecting the analyzer that should be attached to a MATCH 
predicate.
+ * The current implementation is intentionally simple so that future rule-based
+ * selection can plug in without touching the planner surface again.
+ */
+public final class AnalyzerSelector {
+
+    private AnalyzerSelector() {
+    }
+
+    public static Selection select(Index index, String requestedAnalyzer) {
+        Map<String, String> properties = index == null ? 
Collections.emptyMap() : index.getProperties();
+        return select(properties, requestedAnalyzer);
+    }
+
+    public static Selection select(Map<String, String> properties, String 
requestedAnalyzer) {
+        String normalizedRequest = normalize(requestedAnalyzer);
+        String preferredAnalyzer = selectDefaultAnalyzer(properties);
+        String parser = InvertedIndexUtil.getInvertedIndexParser(properties);
+
+        if (!Strings.isNullOrEmpty(normalizedRequest)) {
+            return new Selection(normalizedRequest, parser, true);
+        }
+
+        String resolvedAnalyzer = Strings.isNullOrEmpty(preferredAnalyzer) ? 
parser : preferredAnalyzer;
+        return new Selection(resolvedAnalyzer, parser, false);
+    }
+
+    private static String normalize(String analyzer) {
+        return analyzer == null ? "" : analyzer.trim();
+    }
+
+    private static String selectDefaultAnalyzer(Map<String, String> 
properties) {
+        // Placeholder for future rule-based selection. At the moment we simply
+        // reuse the analyzer attached to the index, if any.
+        return InvertedIndexUtil.getPreferredAnalyzer(properties);
+    }
+
+    public static final class Selection {
+        private final String analyzer;
+        private final String parser;
+        private final boolean explicit;
+
+        private Selection(String analyzer, String parser, boolean explicit) {
+            this.analyzer = normalize(analyzer);
+            String normalizedParser = normalize(parser);
+            this.parser = Strings.isNullOrEmpty(normalizedParser)
+                    ? InvertedIndexUtil.INVERTED_INDEX_PARSER_NONE
+                    : normalizedParser;
+            this.explicit = explicit;
+        }
+
+        public String analyzer() {
+            return analyzer;
+        }
+
+        public String parser() {
+            return parser;
+        }
+
+        public boolean explicit() {
+            return explicit;
+        }
+    }

Review Comment:
   Missing test coverage for the Selection class methods. The new Selection 
class in AnalyzerSelector has public methods (analyzer(), parser(), explicit()) 
that don't appear to have direct unit test coverage. While the class is tested 
through the parse() method tests, consider adding explicit tests for the 
Selection class behavior.



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java:
##########
@@ -225,12 +225,17 @@ public Expr visitMatch(Match match, PlanTranslatorContext 
context) {
             throw new AnalysisException("SlotReference in Match failed to get 
OlapTable, SQL is " + match.toSql());
         }
 
-        Index invertedIndex = olapTbl.getInvertedIndex(column, 
slot.getSubPath());
+        String analyzer = match.getAnalyzer().orElse(null);
+        Index invertedIndex = olapTbl.getInvertedIndex(column, 
slot.getSubPath(), analyzer);
+        if (analyzer != null && invertedIndex == null) {
+            throw new AnalysisException("No inverted index found for analyzer 
'" + analyzer

Review Comment:
   Missing validation of null analyzer parameter. The method accepts a nullable 
analyzer parameter but doesn't validate it before using it in the error 
message. While analyzer is used in a null-safe way in the condition, if it were 
null and the condition were true, the error message would display "null" which 
is not user-friendly.
   ```suggestion
           String analyzerForMsg = (analyzer == null) ? "<unspecified 
analyzer>" : analyzer;
           Index invertedIndex = olapTbl.getInvertedIndex(column, 
slot.getSubPath(), analyzer);
           if (analyzer != null && invertedIndex == null) {
               throw new AnalysisException("No inverted index found for 
analyzer '" + analyzerForMsg
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java:
##########
@@ -500,4 +500,11 @@ public boolean isAnalyzedInvertedIndex() {
                 || 
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY)
                 || 
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY));
     }
+
+    public String getAnalyzerIdentity() {
+        if (indexType != IndexType.INVERTED) {
+            return "";
+        }
+        return InvertedIndexUtil.buildAnalyzerIdentity(properties);
+    }

Review Comment:
   Missing test coverage for the new getAnalyzerIdentity method. This new 
public method returns analyzer identity for inverted indexes but doesn't appear 
to be tested. Consider adding unit tests to verify it returns the correct 
identity for different property configurations.



##########
be/src/vec/exprs/vmatch_predicate.cpp:
##########
@@ -167,7 +172,7 @@ Status VMatchPredicate::execute_column(VExprContext* 
context, const Block* block
 
         auto* column_slot_ref = assert_cast<VSlotRef*>(get_child(0).get());
         std::string column_name = column_slot_ref->expr_name();
-        auto it = std::find(column_names.begin(), column_names.end(), 
column_name);
+        auto it = std::ranges::find(column_names, column_name);

Review Comment:
   Inconsistent range-based for loop usage. The code changes lines 97, 106 to 
use "const auto&" for range-based loops but then uses "std::ranges::find" on 
line 175 which requires C++20 ranges. Consider using consistent modern C++ 
patterns throughout, or ensure all usages are compatible with the project's C++ 
standard.
   ```suggestion
           auto it = std::find(column_names.begin(), column_names.end(), 
column_name);
   ```



##########
fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java:
##########
@@ -2816,12 +2816,26 @@ private boolean checkDuplicateIndexes(List<Index> 
indexes, IndexDefinition index
                     String columnName = indexDef.getColumnNames().get(0);
                     Column column = olapTable.getColumn(columnName);
                     if (column != null && (column.getType().isStringType() || 
column.getType().isVariantType())) {
-                        boolean isExistingIndexAnalyzer = 
index.isAnalyzedInvertedIndex();
-                        boolean isNewIndexAnalyzer = 
indexDef.isAnalyzedInvertedIndex();
-                        if (isExistingIndexAnalyzer == isNewIndexAnalyzer) {
-                            throw new DdlException(
-                                indexDef.getIndexType() + " index for column 
(" + columnName + ") with "
-                                    + (isNewIndexAnalyzer ? "analyzed" : 
"non-analyzed") + " type already exists.");
+                        if (index.getIndexType() == IndexType.INVERTED) {
+                            String existingIdentity = 
index.getAnalyzerIdentity();
+                            String newIdentity = 
indexDef.getAnalyzerIdentity();
+                            if (Objects.equals(existingIdentity, newIdentity)) 
{
+                                String analyzerDesc = 
"__default__".equals(newIdentity)
+                                        ? "default analyzer"
+                                        : "analyzer identity '" + newIdentity 
+ "'";
+                                throw new DdlException(indexDef.getIndexType()
+                                        + " index for column (" + columnName + 
") with analyzer "
+                                        + analyzerDesc + " already exists.");
+                            }
+                        } else {
+                            boolean isExistingIndexAnalyzer = 
index.isAnalyzedInvertedIndex();
+                            boolean isNewIndexAnalyzer = 
indexDef.isAnalyzedInvertedIndex();
+                            if (isExistingIndexAnalyzer == isNewIndexAnalyzer) 
{
+                                throw new DdlException(
+                                    indexDef.getIndexType() + " index for 
column (" + columnName + ") with "
+                                        + (isNewIndexAnalyzer ? "analyzed" : 
"non-analyzed")
+                                        + " type already exists.");

Review Comment:
   Inconsistent error message format. The error message format changes between 
line 2827-2828 (using "analyzer identity") versus lines 2835-2836 (using 
"analyzed"/"non-analyzed"). For consistency, consider using the analyzer 
identity format for both branches, or restructure to have uniform messaging.
   ```suggestion
                                       indexDef.getIndexType() + " index for 
column (" + columnName + ") with analyzer "
                                           + (isNewIndexAnalyzer ? "analyzed" : 
"non-analyzed")
                                           + " already exists.");
   ```



##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+    def tableName = "tbl_match_using_analyzer"
+    def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+    def analyzerStandard = "first_name_standard_match_test"
+    def analyzerEdge = "edge_ngram_name_match_test"
+    def analyzerKeyword = "first_name_keyword_match_test"
+
+    def waitAnalyzerReady = { analyzerName ->
+        sleep(10000)
+        int maxRetry = 30
+        boolean ready = false
+        Exception lastException = null
+        for (int i = 0; i < maxRetry && !ready; i++) {
+            try {
+                sql """ select tokenize("probe", 
'"analyzer"="${analyzerName}"'); """
+                ready = true
+            } catch (Exception e) {
+                lastException = e
+                logger.info("Analyzer ${analyzerName} not ready yet: 
${e.message}")
+                sql """ select sleep(1) """
+            }
+        }
+        assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting: 
${lastException?.message}")
+    }
+
+    try {
+        sql "DROP TABLE IF EXISTS ${tableName}"
+
+        sql """
+            CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+            PROPERTIES
+            (
+                "type" = "edge_ngram",
+                "min_gram" = "1",
+                "max_gram" = "20",
+                "token_chars" = "letter"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+            PROPERTIES
+            (
+                "tokenizer" = "standard",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+            PROPERTIES
+            (
+                "tokenizer" = "${tokenizerName}",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        sql """
+            CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+            PROPERTIES
+            (
+                "tokenizer" = "keyword",
+                "token_filter" = "lowercase"
+            );
+        """
+
+        //waitAnalyzerReady(analyzerStandard)
+        //waitAnalyzerReady(analyzerEdge)
+        //waitAnalyzerReady(analyzerKeyword)
+
+        sql """
+            CREATE TABLE ${tableName} (
+                id INT,
+                first_name STRING,
+                INDEX idx_fn_std (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerStandard}", "support_phrase"="true"),
+                INDEX idx_fn_ngr (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerEdge}"),
+                INDEX idx_fn_exact (first_name) USING INVERTED 
PROPERTIES("analyzer"="${analyzerKeyword}")
+            ) DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES (
+                "replication_allocation" = "tag.location.default: 1",
+                "disable_auto_compaction" = "true"
+            )
+        """
+        sleep(10000)

Review Comment:
   Hardcoded sleep of 10 seconds without explanation. This appears arbitrary 
and could make tests unnecessarily slow. Consider either using the existing 
waitAnalyzerReady mechanism (which is commented out above) or documenting why 
this specific duration is required.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feature](inverted index) support multiple tokenize index in one column [doris]

Reply via email to