Copilot commented on code in PR #59117:
URL: https://github.com/apache/doris/pull/59117#discussion_r2630506527
##########
fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java:
##########
@@ -395,4 +395,11 @@ public boolean isAnalyzedInvertedIndex() {
||
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY)
||
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY));
}
+
+ public String getAnalyzerIdentity() {
+ if (indexType != IndexDefinition.IndexType.INVERTED) {
+ return "";
+ }
+ return InvertedIndexUtil.buildAnalyzerIdentity(properties);
+ }
Review Comment:
Missing test coverage for the new getAnalyzerIdentity method. This new
public method returns analyzer identity for indexes but doesn't appear to be
tested. Consider adding unit tests to verify it returns the correct identity
for different property configurations.
##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+ def tableName = "tbl_match_using_analyzer"
+ def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+ def analyzerStandard = "first_name_standard_match_test"
+ def analyzerEdge = "edge_ngram_name_match_test"
+ def analyzerKeyword = "first_name_keyword_match_test"
+
+ def waitAnalyzerReady = { analyzerName ->
+ sleep(10000)
+ int maxRetry = 30
+ boolean ready = false
+ Exception lastException = null
+ for (int i = 0; i < maxRetry && !ready; i++) {
+ try {
+ sql """ select tokenize("probe",
'"analyzer"="${analyzerName}"'); """
+ ready = true
+ } catch (Exception e) {
+ lastException = e
+ logger.info("Analyzer ${analyzerName} not ready yet:
${e.message}")
+ sql """ select sleep(1) """
+ }
+ }
+ assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting:
${lastException?.message}")
+ }
+
+ try {
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+ PROPERTIES
+ (
+ "type" = "edge_ngram",
+ "min_gram" = "1",
+ "max_gram" = "20",
+ "token_chars" = "letter"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+ PROPERTIES
+ (
+ "tokenizer" = "standard",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+ PROPERTIES
+ (
+ "tokenizer" = "${tokenizerName}",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+ PROPERTIES
+ (
+ "tokenizer" = "keyword",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ //waitAnalyzerReady(analyzerStandard)
+ //waitAnalyzerReady(analyzerEdge)
+ //waitAnalyzerReady(analyzerKeyword)
Review Comment:
Commented-out code should be removed. These analyzer readiness check calls
appear to be intentionally disabled but if they're not needed, they should be
deleted rather than commented out. If they're needed for future use, consider
documenting why they're disabled.
##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+ def tableName = "tbl_match_using_analyzer"
+ def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+ def analyzerStandard = "first_name_standard_match_test"
+ def analyzerEdge = "edge_ngram_name_match_test"
+ def analyzerKeyword = "first_name_keyword_match_test"
+
+ def waitAnalyzerReady = { analyzerName ->
+ sleep(10000)
+ int maxRetry = 30
+ boolean ready = false
+ Exception lastException = null
+ for (int i = 0; i < maxRetry && !ready; i++) {
+ try {
+ sql """ select tokenize("probe",
'"analyzer"="${analyzerName}"'); """
+ ready = true
+ } catch (Exception e) {
+ lastException = e
+ logger.info("Analyzer ${analyzerName} not ready yet:
${e.message}")
+ sql """ select sleep(1) """
+ }
+ }
+ assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting:
${lastException?.message}")
+ }
+
+ try {
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+ PROPERTIES
+ (
+ "type" = "edge_ngram",
+ "min_gram" = "1",
+ "max_gram" = "20",
+ "token_chars" = "letter"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+ PROPERTIES
+ (
+ "tokenizer" = "standard",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+ PROPERTIES
+ (
+ "tokenizer" = "${tokenizerName}",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+ PROPERTIES
+ (
+ "tokenizer" = "keyword",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ //waitAnalyzerReady(analyzerStandard)
+ //waitAnalyzerReady(analyzerEdge)
+ //waitAnalyzerReady(analyzerKeyword)
+
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ first_name STRING,
+ INDEX idx_fn_std (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerStandard}", "support_phrase"="true"),
+ INDEX idx_fn_ngr (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerEdge}"),
+ INDEX idx_fn_exact (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerKeyword}")
+ ) DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+ sleep(10000)
+ sql """
+ INSERT INTO ${tableName} VALUES
+ (1, 'alice'),
+ (2, 'alice cooper'),
+ (3, 'bob'),
+ (4, 'allyson');
+ """
+
+ sql "sync"
+
+ // default analyzer should match both tokenized rows that contain
"alice"
+ qt_match_default """
+ SELECT id FROM ${tableName}
+ WHERE first_name MATCH 'alice'
+ ORDER BY id
+ """
+
+ // keyword analyzer only matches the full literal "alice"
+ qt_match_keyword """
+ SELECT id FROM ${tableName}
+ WHERE first_name MATCH 'alice' USING ANALYZER ${analyzerKeyword}
+ ORDER BY id
+ """
+
+ // edge ngram analyzer should cover names beginning with "al"
+ qt_match_edge """
+ SELECT id FROM ${tableName}
+ WHERE first_name MATCH_PHRASE_PREFIX 'al' USING ANALYZER
${analyzerEdge}
+ ORDER BY id
+ """
+
+ // show create table should contain analyzer names
+ def showCreate = sql """ SHOW CREATE TABLE ${tableName} """
+ def createStmt = showCreate.collect { row ->
row[1].toString().toLowerCase() }.join("\n")
+ assertTrue(createStmt.contains(analyzerStandard.toLowerCase()))
+ assertTrue(createStmt.contains(analyzerEdge.toLowerCase()))
+ assertTrue(createStmt.contains(analyzerKeyword.toLowerCase()))
+ } finally {
+ /*sql "DROP TABLE IF EXISTS ${tableName}"
+
+ try {
+ sql "DROP INVERTED INDEX ANALYZER ${analyzerStandard}"
+ } catch (Exception ignored) {
+ }
+
+ try {
+ sql "DROP INVERTED INDEX ANALYZER ${analyzerEdge}"
+ } catch (Exception ignored) {
+ }
+
+ try {
+ sql "DROP INVERTED INDEX ANALYZER ${analyzerKeyword}"
+ } catch (Exception ignored) {
+ }
+
+ try {
+ sql "DROP INVERTED INDEX TOKENIZER ${tokenizerName}"
+ } catch (Exception ignored) {
+ }*/
Review Comment:
Large block of commented-out cleanup code should be removed or uncommented.
The finally block contains extensive cleanup logic that is commented out. If
this cleanup is necessary, it should be active. If it's not needed (perhaps
handled elsewhere), the commented code should be deleted to improve code
clarity.
##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -137,25 +157,32 @@ public boolean equals(Object obj) {
if (!super.equals(obj)) {
return false;
}
- return ((MatchPredicate) obj).op == op;
+ MatchPredicate other = (MatchPredicate) obj;
+ return other.op == op
+ && Objects.equals(explicitAnalyzer, other.explicitAnalyzer)
+ && Objects.equals(invertedIndexAnalyzerName,
other.invertedIndexAnalyzerName)
+ && Objects.equals(invertedIndexParser,
other.invertedIndexParser);
}
@Override
public String toSqlImpl() {
- return getChild(0).toSql() + " " + op.toString() + " " +
getChild(1).toSql();
+ return getChild(0).toSql() + " " + op.toString() + " " +
getChild(1).toSql()
+ + analyzerSqlFragment();
}
@Override
public String toSqlImpl(boolean disableTableName, boolean needExternalSql,
TableType tableType,
TableIf table) {
return getChild(0).toSql(disableTableName, needExternalSql, tableType,
table) + " " + op.toString() + " "
- + getChild(1).toSql(disableTableName, needExternalSql,
tableType, table);
+ + getChild(1).toSql(disableTableName, needExternalSql,
tableType, table)
+ + analyzerSqlFragment();
}
@Override
protected void toThrift(TExprNode msg) {
msg.node_type = TExprNodeType.MATCH_PRED;
msg.setOpcode(op.getOpcode());
+ // Use new TMatchPredicate constructor with required fields
Review Comment:
Incorrect comment about TMatchPredicate constructor. The comment states "Use
new TMatchPredicate constructor with required fields" but the code is actually
using the existing TMatchPredicate constructor with two parameters. This
comment is misleading and should be removed or clarified.
```suggestion
// Initialize TMatchPredicate with parser settings and additional
configuration
```
##########
be/src/olap/rowset/segment_v2/inverted_index_iterator.h:
##########
@@ -31,7 +33,6 @@ struct InvertedIndexParam {
uint32_t num_rows;
std::shared_ptr<roaring::Roaring> roaring;
bool skip_try = false;
-
// Pointer to analyzer context (can be nullptr if not needed)
Review Comment:
Unnecessary trailing whitespace in comment. Line 36 ends with a comment that
appears to have trailing whitespace based on the diff context. This affects
code cleanliness.
##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -75,54 +77,72 @@ public TExprOpcode getOpcode() {
@SerializedName("op")
private Operator op;
+ // Fields for thrift serialization (restored from old version)
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
private boolean invertedIndexParserLowercase = true;
private String invertedIndexParserStopwords = "";
private String invertedIndexAnalyzerName = "";
+ // Fields for SQL generation
+ private String explicitAnalyzer = "";
private MatchPredicate() {
// use for serde only
invertedIndexParser = InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN;
invertedIndexParserMode =
InvertedIndexUtil.INVERTED_INDEX_PARSER_FINE_GRANULARITY;
}
+ protected MatchPredicate(MatchPredicate other) {
+ super(other);
+ op = other.op;
+ invertedIndexParser = other.invertedIndexParser;
+ invertedIndexParserMode = other.invertedIndexParserMode;
+ invertedIndexCharFilter = other.invertedIndexCharFilter;
+ invertedIndexParserLowercase = other.invertedIndexParserLowercase;
+ invertedIndexParserStopwords = other.invertedIndexParserStopwords;
+ invertedIndexAnalyzerName = other.invertedIndexAnalyzerName;
+ explicitAnalyzer = other.explicitAnalyzer;
+ }
+
/**
* use for Nereids ONLY
*/
public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
NullableMode nullableMode, Index invertedIndex, boolean nullable) {
+ this(op, e1, e2, retType, nullableMode, invertedIndex, nullable, null);
+ }
Review Comment:
Constructor order is unusual - simpler constructor delegates to more complex
one which then delegates to another. Consider reordering: have the most complex
constructor first, then simpler ones delegate to it. This would make the
delegation chain clearer and follow common Java constructor patterns.
##########
fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java:
##########
@@ -75,54 +77,72 @@ public TExprOpcode getOpcode() {
@SerializedName("op")
private Operator op;
+ // Fields for thrift serialization (restored from old version)
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
private boolean invertedIndexParserLowercase = true;
private String invertedIndexParserStopwords = "";
private String invertedIndexAnalyzerName = "";
+ // Fields for SQL generation
Review Comment:
Unnecessary comment about code that was moved. The comment "Fields for
thrift serialization (restored from old version)" seems to reference
implementation history rather than explaining the purpose of these fields.
Consider updating to describe what these fields are used for functionally, or
remove if self-explanatory.
```suggestion
// Inverted index analyzer configuration persisted via Thrift
serialization
private String invertedIndexParser;
private String invertedIndexParserMode;
private Map<String, String> invertedIndexCharFilter;
private boolean invertedIndexParserLowercase = true;
private String invertedIndexParserStopwords = "";
private String invertedIndexAnalyzerName = "";
// Analyzer name explicitly specified in the SQL MATCH predicate, used
for SQL generation
```
##########
be/src/vec/exprs/vmatch_predicate.cpp:
##########
@@ -29,10 +29,10 @@
#include <gen_cpp/Exprs_types.h>
#include <glog/logging.h>
-#include <cstddef>
#include <memory>
-#include <ostream>
+#include <string>
#include <string_view>
+#include <type_traits>
Review Comment:
Unnecessary inclusion of includes in comment. The comment references
"cstddef" and "ostream" which are no longer included after the changes. These
appear to be removed includes that shouldn't be mentioned in the comment about
what changed. Consider removing them from the diff context or clarifying the
comment intent.
##########
be/src/vec/functions/match.cpp:
##########
@@ -121,7 +127,6 @@ Status FunctionMatchBase::execute_impl(FunctionContext*
context, Block& block,
VLOG_DEBUG << "begin to execute match directly, column_name=" <<
column_name
<< ", match_query_str=" << match_query_str;
auto* analyzer_ctx = get_match_analyzer_ctx(context);
Review Comment:
Unnecessary change removing blank line. This change removes a blank line
that was separating the analyzer_ctx initialization from the source column
extraction. The blank line improved readability by grouping related operations.
Consider keeping it.
```suggestion
auto* analyzer_ctx = get_match_analyzer_ctx(context);
```
##########
fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzerSelector.java:
##########
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.analysis;
+
+import org.apache.doris.catalog.Index;
+
+import com.google.common.base.Strings;
+
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * Helper for selecting the analyzer that should be attached to a MATCH
predicate.
+ * The current implementation is intentionally simple so that future rule-based
+ * selection can plug in without touching the planner surface again.
+ */
+public final class AnalyzerSelector {
+
+ private AnalyzerSelector() {
+ }
+
+ public static Selection select(Index index, String requestedAnalyzer) {
+ Map<String, String> properties = index == null ?
Collections.emptyMap() : index.getProperties();
+ return select(properties, requestedAnalyzer);
+ }
+
+ public static Selection select(Map<String, String> properties, String
requestedAnalyzer) {
+ String normalizedRequest = normalize(requestedAnalyzer);
+ String preferredAnalyzer = selectDefaultAnalyzer(properties);
+ String parser = InvertedIndexUtil.getInvertedIndexParser(properties);
+
+ if (!Strings.isNullOrEmpty(normalizedRequest)) {
+ return new Selection(normalizedRequest, parser, true);
+ }
+
+ String resolvedAnalyzer = Strings.isNullOrEmpty(preferredAnalyzer) ?
parser : preferredAnalyzer;
+ return new Selection(resolvedAnalyzer, parser, false);
+ }
+
+ private static String normalize(String analyzer) {
+ return analyzer == null ? "" : analyzer.trim();
+ }
+
+ private static String selectDefaultAnalyzer(Map<String, String>
properties) {
+ // Placeholder for future rule-based selection. At the moment we simply
+ // reuse the analyzer attached to the index, if any.
+ return InvertedIndexUtil.getPreferredAnalyzer(properties);
+ }
+
+ public static final class Selection {
+ private final String analyzer;
+ private final String parser;
+ private final boolean explicit;
+
+ private Selection(String analyzer, String parser, boolean explicit) {
+ this.analyzer = normalize(analyzer);
+ String normalizedParser = normalize(parser);
+ this.parser = Strings.isNullOrEmpty(normalizedParser)
+ ? InvertedIndexUtil.INVERTED_INDEX_PARSER_NONE
+ : normalizedParser;
+ this.explicit = explicit;
+ }
+
+ public String analyzer() {
+ return analyzer;
+ }
+
+ public String parser() {
+ return parser;
+ }
+
+ public boolean explicit() {
+ return explicit;
+ }
+ }
Review Comment:
Missing test coverage for the Selection class methods. The new Selection
class in AnalyzerSelector has public methods (analyzer(), parser(), explicit())
that don't appear to have direct unit test coverage. While the class is tested
through the parse() method tests, consider adding explicit tests for the
Selection class behavior.
##########
fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java:
##########
@@ -225,12 +225,17 @@ public Expr visitMatch(Match match, PlanTranslatorContext
context) {
throw new AnalysisException("SlotReference in Match failed to get
OlapTable, SQL is " + match.toSql());
}
- Index invertedIndex = olapTbl.getInvertedIndex(column,
slot.getSubPath());
+ String analyzer = match.getAnalyzer().orElse(null);
+ Index invertedIndex = olapTbl.getInvertedIndex(column,
slot.getSubPath(), analyzer);
+ if (analyzer != null && invertedIndex == null) {
+ throw new AnalysisException("No inverted index found for analyzer
'" + analyzer
Review Comment:
Missing validation of null analyzer parameter. The method accepts a nullable
analyzer parameter but doesn't validate it before using it in the error
message. While analyzer is used in a null-safe way in the condition, if it were
null and the condition were true, the error message would display "null" which
is not user-friendly.
```suggestion
String analyzerForMsg = (analyzer == null) ? "<unspecified
analyzer>" : analyzer;
Index invertedIndex = olapTbl.getInvertedIndex(column,
slot.getSubPath(), analyzer);
if (analyzer != null && invertedIndex == null) {
throw new AnalysisException("No inverted index found for
analyzer '" + analyzerForMsg
```
##########
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/IndexDefinition.java:
##########
@@ -500,4 +500,11 @@ public boolean isAnalyzedInvertedIndex() {
||
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_ANALYZER_NAME_KEY)
||
properties.containsKey(InvertedIndexUtil.INVERTED_INDEX_NORMALIZER_NAME_KEY));
}
+
+ public String getAnalyzerIdentity() {
+ if (indexType != IndexType.INVERTED) {
+ return "";
+ }
+ return InvertedIndexUtil.buildAnalyzerIdentity(properties);
+ }
Review Comment:
Missing test coverage for the new getAnalyzerIdentity method. This new
public method returns analyzer identity for inverted indexes but doesn't appear
to be tested. Consider adding unit tests to verify it returns the correct
identity for different property configurations.
##########
be/src/vec/exprs/vmatch_predicate.cpp:
##########
@@ -167,7 +172,7 @@ Status VMatchPredicate::execute_column(VExprContext*
context, const Block* block
auto* column_slot_ref = assert_cast<VSlotRef*>(get_child(0).get());
std::string column_name = column_slot_ref->expr_name();
- auto it = std::find(column_names.begin(), column_names.end(),
column_name);
+ auto it = std::ranges::find(column_names, column_name);
Review Comment:
Inconsistent range-based for loop usage. The code changes lines 97, 106 to
use "const auto&" for range-based loops but then uses "std::ranges::find" on
line 175 which requires C++20 ranges. Consider using consistent modern C++
patterns throughout, or ensure all usages are compatible with the project's C++
standard.
```suggestion
auto it = std::find(column_names.begin(), column_names.end(),
column_name);
```
##########
fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java:
##########
@@ -2816,12 +2816,26 @@ private boolean checkDuplicateIndexes(List<Index>
indexes, IndexDefinition index
String columnName = indexDef.getColumnNames().get(0);
Column column = olapTable.getColumn(columnName);
if (column != null && (column.getType().isStringType() ||
column.getType().isVariantType())) {
- boolean isExistingIndexAnalyzer =
index.isAnalyzedInvertedIndex();
- boolean isNewIndexAnalyzer =
indexDef.isAnalyzedInvertedIndex();
- if (isExistingIndexAnalyzer == isNewIndexAnalyzer) {
- throw new DdlException(
- indexDef.getIndexType() + " index for column
(" + columnName + ") with "
- + (isNewIndexAnalyzer ? "analyzed" :
"non-analyzed") + " type already exists.");
+ if (index.getIndexType() == IndexType.INVERTED) {
+ String existingIdentity =
index.getAnalyzerIdentity();
+ String newIdentity =
indexDef.getAnalyzerIdentity();
+ if (Objects.equals(existingIdentity, newIdentity))
{
+ String analyzerDesc =
"__default__".equals(newIdentity)
+ ? "default analyzer"
+ : "analyzer identity '" + newIdentity
+ "'";
+ throw new DdlException(indexDef.getIndexType()
+ + " index for column (" + columnName +
") with analyzer "
+ + analyzerDesc + " already exists.");
+ }
+ } else {
+ boolean isExistingIndexAnalyzer =
index.isAnalyzedInvertedIndex();
+ boolean isNewIndexAnalyzer =
indexDef.isAnalyzedInvertedIndex();
+ if (isExistingIndexAnalyzer == isNewIndexAnalyzer)
{
+ throw new DdlException(
+ indexDef.getIndexType() + " index for
column (" + columnName + ") with "
+ + (isNewIndexAnalyzer ? "analyzed" :
"non-analyzed")
+ + " type already exists.");
Review Comment:
Inconsistent error message format. The error message format changes between
line 2827-2828 (using "analyzer identity") versus lines 2835-2836 (using
"analyzed"/"non-analyzed"). For consistency, consider using the analyzer
identity format for both branches, or restructure to have uniform messaging.
```suggestion
indexDef.getIndexType() + " index for
column (" + columnName + ") with analyzer "
+ (isNewIndexAnalyzer ? "analyzed" :
"non-analyzed")
+ " already exists.");
```
##########
regression-test/suites/inverted_index_p0/test_match_using_analyzer.groovy:
##########
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_match_using_analyzer", "p0") {
+ def tableName = "tbl_match_using_analyzer"
+ def tokenizerName = "edge_ngram_name_tokenizer_match_test"
+ def analyzerStandard = "first_name_standard_match_test"
+ def analyzerEdge = "edge_ngram_name_match_test"
+ def analyzerKeyword = "first_name_keyword_match_test"
+
+ def waitAnalyzerReady = { analyzerName ->
+ sleep(10000)
+ int maxRetry = 30
+ boolean ready = false
+ Exception lastException = null
+ for (int i = 0; i < maxRetry && !ready; i++) {
+ try {
+ sql """ select tokenize("probe",
'"analyzer"="${analyzerName}"'); """
+ ready = true
+ } catch (Exception e) {
+ lastException = e
+ logger.info("Analyzer ${analyzerName} not ready yet:
${e.message}")
+ sql """ select sleep(1) """
+ }
+ }
+ assertTrue(ready, "Analyzer ${analyzerName} not ready after waiting:
${lastException?.message}")
+ }
+
+ try {
+ sql "DROP TABLE IF EXISTS ${tableName}"
+
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS ${tokenizerName}
+ PROPERTIES
+ (
+ "type" = "edge_ngram",
+ "min_gram" = "1",
+ "max_gram" = "20",
+ "token_chars" = "letter"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerStandard}
+ PROPERTIES
+ (
+ "tokenizer" = "standard",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerEdge}
+ PROPERTIES
+ (
+ "tokenizer" = "${tokenizerName}",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS ${analyzerKeyword}
+ PROPERTIES
+ (
+ "tokenizer" = "keyword",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ //waitAnalyzerReady(analyzerStandard)
+ //waitAnalyzerReady(analyzerEdge)
+ //waitAnalyzerReady(analyzerKeyword)
+
+ sql """
+ CREATE TABLE ${tableName} (
+ id INT,
+ first_name STRING,
+ INDEX idx_fn_std (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerStandard}", "support_phrase"="true"),
+ INDEX idx_fn_ngr (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerEdge}"),
+ INDEX idx_fn_exact (first_name) USING INVERTED
PROPERTIES("analyzer"="${analyzerKeyword}")
+ ) DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ )
+ """
+ sleep(10000)
Review Comment:
Hardcoded sleep of 10 seconds without explanation. This appears arbitrary
and could make tests unnecessarily slow. Consider either using the existing
waitAnalyzerReady mechanism (which is commented out above) or documenting why
this specific duration is required.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]