This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 2cf81becff Enhance REGEXP_LIKE to scan dictionary when dictionary is
small (#16478)
2cf81becff is described below
commit 2cf81becffbb1e4df5985f973219b451617f4646
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Fri Aug 1 00:35:04 2025 -0600
Enhance REGEXP_LIKE to scan dictionary when dictionary is small (#16478)
---
.../core/operator/filter/FilterOperatorUtils.java | 36 ++++-------
...aseDictIdBasedRegexpLikePredicateEvaluator.java | 31 ++++++++++
.../FSTBasedRegexpPredicateEvaluatorFactory.java | 2 +-
.../IFSTBasedRegexpPredicateEvaluatorFactory.java | 2 +-
.../RegexpLikePredicateEvaluatorFactory.java | 70 ++++++++++++++++++++--
.../tests/OfflineClusterIntegrationTest.java | 19 ++++++
6 files changed, 127 insertions(+), 33 deletions(-)
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
index b6cab14d83..2f38d18443 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
@@ -23,7 +23,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.OptionalInt;
import org.apache.pinot.common.request.context.predicate.Predicate;
-import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
+import
org.apache.pinot.core.operator.filter.predicate.BaseDictIdBasedRegexpLikePredicateEvaluator;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import org.apache.pinot.core.query.request.context.QueryContext;
import org.apache.pinot.segment.spi.datasource.DataSource;
@@ -54,20 +54,19 @@ public class FilterOperatorUtils {
/**
* Returns the AND filter operator or equivalent filter operator.
*/
- BaseFilterOperator getAndFilterOperator(QueryContext queryContext,
- List<BaseFilterOperator> filterOperators, int numDocs);
+ BaseFilterOperator getAndFilterOperator(QueryContext queryContext,
List<BaseFilterOperator> filterOperators,
+ int numDocs);
/**
* Returns the OR filter operator or equivalent filter operator.
*/
- BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
- List<BaseFilterOperator> filterOperators, int numDocs);
+ BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
List<BaseFilterOperator> filterOperators,
+ int numDocs);
/**
* Returns the NOT filter operator or equivalent filter operator.
*/
- BaseFilterOperator getNotFilterOperator(QueryContext queryContext,
BaseFilterOperator filterOperator,
- int numDocs);
+ BaseFilterOperator getNotFilterOperator(QueryContext queryContext,
BaseFilterOperator filterOperator, int numDocs);
}
public static class DefaultImplementation implements Implementation {
@@ -107,24 +106,12 @@ public class FilterOperatorUtils {
}
return new ScanBasedFilterOperator(queryContext, predicateEvaluator,
dataSource, numDocs);
} else if (predicateType == Predicate.Type.REGEXP_LIKE) {
- // Check if case-insensitive flag is present
- RegexpLikePredicate regexpLikePredicate = (RegexpLikePredicate)
predicateEvaluator.getPredicate();
- boolean caseInsensitive = regexpLikePredicate.isCaseInsensitive();
- if (caseInsensitive) {
- if (dataSource.getIFSTIndex() != null &&
dataSource.getDataSourceMetadata().isSorted()
+ if (predicateEvaluator instanceof
BaseDictIdBasedRegexpLikePredicateEvaluator) {
+ if (dataSource.getDataSourceMetadata().isSorted()
&& queryContext.isIndexUseAllowed(dataSource,
FieldConfig.IndexType.SORTED)) {
return new SortedIndexBasedFilterOperator(queryContext,
predicateEvaluator, dataSource, numDocs);
}
- if (dataSource.getIFSTIndex() != null &&
dataSource.getInvertedIndex() != null
- && queryContext.isIndexUseAllowed(dataSource,
FieldConfig.IndexType.INVERTED)) {
- return new InvertedIndexFilterOperator(queryContext,
predicateEvaluator, dataSource, numDocs);
- }
- } else {
- if (dataSource.getFSTIndex() != null &&
dataSource.getDataSourceMetadata().isSorted()
- && queryContext.isIndexUseAllowed(dataSource,
FieldConfig.IndexType.SORTED)) {
- return new SortedIndexBasedFilterOperator(queryContext,
predicateEvaluator, dataSource, numDocs);
- }
- if (dataSource.getFSTIndex() != null &&
dataSource.getInvertedIndex() != null
+ if (dataSource.getInvertedIndex() != null
&& queryContext.isIndexUseAllowed(dataSource,
FieldConfig.IndexType.INVERTED)) {
return new InvertedIndexFilterOperator(queryContext,
predicateEvaluator, dataSource, numDocs);
}
@@ -174,8 +161,8 @@ public class FilterOperatorUtils {
}
@Override
- public BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
- List<BaseFilterOperator> filterOperators, int numDocs) {
+ public BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
List<BaseFilterOperator> filterOperators,
+ int numDocs) {
List<BaseFilterOperator> childFilterOperators = new
ArrayList<>(filterOperators.size());
for (BaseFilterOperator filterOperator : filterOperators) {
if (filterOperator.isResultMatchingAll()) {
@@ -210,7 +197,6 @@ public class FilterOperatorUtils {
return new NotFilterOperator(filterOperator, numDocs,
queryContext.isNullHandlingEnabled());
}
-
/**
* For AND filter operator, reorders its child filter operators based on
their cost and puts the ones with
* inverted index first in order to reduce the number of documents to be
processed.
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
new file mode 100644
index 0000000000..8067377635
--- /dev/null
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.filter.predicate;
+
+import org.apache.pinot.common.request.context.predicate.Predicate;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+
+
+/// Base class for dictionary-based REGEXP_LIKE predicate evaluators that use
dictionary IDs for matching.
+public abstract class BaseDictIdBasedRegexpLikePredicateEvaluator extends
BaseDictionaryBasedPredicateEvaluator {
+
+ protected BaseDictIdBasedRegexpLikePredicateEvaluator(Predicate predicate,
Dictionary dictionary) {
+ super(predicate, dictionary);
+ }
+}
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
index 11dbe7aa99..8cbdb42884 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
@@ -49,7 +49,7 @@ public class FSTBasedRegexpPredicateEvaluatorFactory {
/**
* Matches regexp query using FSTIndexReader.
*/
- private static class FSTBasedRegexpPredicateEvaluator extends
BaseDictionaryBasedPredicateEvaluator {
+ private static class FSTBasedRegexpPredicateEvaluator extends
BaseDictIdBasedRegexpLikePredicateEvaluator {
final ImmutableRoaringBitmap _matchingDictIdBitmap;
public FSTBasedRegexpPredicateEvaluator(RegexpLikePredicate
regexpLikePredicate, TextIndexReader fstIndexReader,
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
index a2d82b765d..074d44d3e9 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
@@ -45,7 +45,7 @@ public class IFSTBasedRegexpPredicateEvaluatorFactory {
return new IFSTBasedRegexpPredicateEvaluator(regexpLikePredicate,
ifstIndexReader, dictionary);
}
- private static class IFSTBasedRegexpPredicateEvaluator extends
BaseDictionaryBasedPredicateEvaluator {
+ private static class IFSTBasedRegexpPredicateEvaluator extends
BaseDictIdBasedRegexpLikePredicateEvaluator {
final ImmutableRoaringBitmap _matchingDictIdBitmap;
public IFSTBasedRegexpPredicateEvaluator(RegexpLikePredicate
regexpLikePredicate,
diff --git
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
index e9c2bb73fd..169230831c 100644
---
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
+++
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
@@ -19,6 +19,10 @@
package org.apache.pinot.core.operator.filter.predicate;
import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.ints.IntSet;
import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
import org.apache.pinot.common.utils.regex.Matcher;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
@@ -32,6 +36,9 @@ public class RegexpLikePredicateEvaluatorFactory {
private RegexpLikePredicateEvaluatorFactory() {
}
+ /// When the cardinality of the dictionary is less than this threshold, scan
the dictionary to get the matching ids.
+ public static final int DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN = 10000;
+
/**
* Create a new instance of dictionary based REGEXP_LIKE predicate evaluator.
*
@@ -42,9 +49,12 @@ public class RegexpLikePredicateEvaluatorFactory {
*/
public static BaseDictionaryBasedPredicateEvaluator
newDictionaryBasedEvaluator(
RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType
dataType) {
- boolean condition = (dataType == DataType.STRING || dataType ==
DataType.JSON);
- Preconditions.checkArgument(condition, "Unsupported data type: " +
dataType);
- return new
DictionaryBasedRegexpLikePredicateEvaluator(regexpLikePredicate, dictionary);
+ Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING,
"Unsupported data type: " + dataType);
+ if (dictionary.length() < DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN) {
+ return new DictIdBasedRegexpLikePredicateEvaluator(regexpLikePredicate,
dictionary);
+ } else {
+ return new ScanBasedRegexpLikePredicateEvaluator(regexpLikePredicate,
dictionary);
+ }
}
/**
@@ -56,16 +66,64 @@ public class RegexpLikePredicateEvaluatorFactory {
*/
public static BaseRawValueBasedPredicateEvaluator
newRawValueBasedEvaluator(RegexpLikePredicate regexpLikePredicate,
DataType dataType) {
- Preconditions.checkArgument(dataType == DataType.STRING, "Unsupported data
type: " + dataType);
+ Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING,
"Unsupported data type: " + dataType);
return new RawValueBasedRegexpLikePredicateEvaluator(regexpLikePredicate);
}
- private static final class DictionaryBasedRegexpLikePredicateEvaluator
extends BaseDictionaryBasedPredicateEvaluator {
+ private static final class DictIdBasedRegexpLikePredicateEvaluator
+ extends BaseDictIdBasedRegexpLikePredicateEvaluator {
+ final IntSet _matchingDictIdSet;
+
+ public DictIdBasedRegexpLikePredicateEvaluator(RegexpLikePredicate
regexpLikePredicate, Dictionary dictionary) {
+ super(regexpLikePredicate, dictionary);
+ Matcher matcher = regexpLikePredicate.getPattern().matcher("");
+ IntList matchingDictIds = new IntArrayList();
+ int dictionarySize = _dictionary.length();
+ for (int dictId = 0; dictId < dictionarySize; dictId++) {
+ if (matcher.reset(dictionary.getStringValue(dictId)).find()) {
+ matchingDictIds.add(dictId);
+ }
+ }
+ int numMatchingDictIds = matchingDictIds.size();
+ if (numMatchingDictIds == 0) {
+ _alwaysFalse = true;
+ } else if (dictionarySize == numMatchingDictIds) {
+ _alwaysTrue = true;
+ }
+ _matchingDictIds = matchingDictIds.toIntArray();
+ _matchingDictIdSet = new IntOpenHashSet(_matchingDictIds);
+ }
+
+ @Override
+ public int getNumMatchingItems() {
+ return _matchingDictIdSet.size();
+ }
+
+ @Override
+ public boolean applySV(int dictId) {
+ return _matchingDictIdSet.contains(dictId);
+ }
+
+ @Override
+ public int applySV(int limit, int[] docIds, int[] values) {
+ // reimplemented here to ensure applySV can be inlined
+ int matches = 0;
+ for (int i = 0; i < limit; i++) {
+ int value = values[i];
+ if (applySV(value)) {
+ docIds[matches++] = docIds[i];
+ }
+ }
+ return matches;
+ }
+ }
+
+ private static final class ScanBasedRegexpLikePredicateEvaluator extends
BaseDictionaryBasedPredicateEvaluator {
// Reuse matcher to avoid excessive allocation. This is safe to do because
the evaluator is always used
// within the scope of a single thread.
final Matcher _matcher;
- public DictionaryBasedRegexpLikePredicateEvaluator(RegexpLikePredicate
regexpLikePredicate, Dictionary dictionary) {
+ public ScanBasedRegexpLikePredicateEvaluator(RegexpLikePredicate
regexpLikePredicate, Dictionary dictionary) {
super(regexpLikePredicate, dictionary);
_matcher = regexpLikePredicate.getPattern().matcher("");
}
diff --git
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
index 36af424482..83ea214643 100644
---
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
+++
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
@@ -1893,6 +1893,8 @@ public class OfflineClusterIntegrationTest extends
BaseClusterIntegrationTestSet
}, 60_000L, "Failed to query new added columns without reload");
// Table size shouldn't change without reload
assertEquals(getTableSize(getTableName()), _tableSize);
+ // Test REGEXP_LIKE on new added columns
+ testRegexpLikeOnNewAddedColumns();
// Trigger reload and verify column count
reloadAllSegments(TEST_EXTRA_COLUMNS_QUERY, false, numTotalDocs);
@@ -1942,6 +1944,7 @@ public class OfflineClusterIntegrationTest extends
BaseClusterIntegrationTestSet
assertTrue(derivedNullStringColumnIndex.has(StandardIndexes.NULL_VALUE_VECTOR_ID));
testNewAddedColumns();
+ testRegexpLikeOnNewAddedColumns();
// The multi-stage query engine doesn't support expression overrides
currently
if (!useMultiStageQueryEngine()) {
@@ -2209,6 +2212,22 @@ public class OfflineClusterIntegrationTest extends
BaseClusterIntegrationTestSet
assertEquals(row.get(12).asDouble(), 0.0);
}
+ private void testRegexpLikeOnNewAddedColumns()
+ throws Exception {
+ int numTotalDocs = (int) getCountStarResult();
+
+ // REGEXP_LIKE on new added dictionary-encoded columns should not scan the
table when it matches all or nothing
+ for (String column : List.of("NewAddedSVJSONDimension",
"NewAddedDerivedNullString")) {
+ JsonNode response = postQuery("SELECT COUNT(*) FROM mytable WHERE
REGEXP_LIKE(" + column + ", 'foo')");
+
assertEquals(response.get("resultTable").get("rows").get(0).get(0).asInt(), 0);
+ assertEquals(response.get("numEntriesScannedInFilter").asInt(), 0);
+
+ response = postQuery("SELECT COUNT(*) FROM mytable WHERE REGEXP_LIKE(" +
column + ", '.*')");
+
assertEquals(response.get("resultTable").get("rows").get(0).get(0).asInt(),
numTotalDocs);
+ assertEquals(response.get("numEntriesScannedInFilter").asInt(), 0);
+ }
+ }
+
private void testExpressionOverride()
throws Exception {
String query = "SELECT COUNT(*) FROM mytable WHERE DaysSinceEpoch * 24 =
392184";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]