This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 2cf81becff Enhance REGEXP_LIKE to scan dictionary when dictionary is 
small (#16478)
2cf81becff is described below

commit 2cf81becffbb1e4df5985f973219b451617f4646
Author: Xiaotian (Jackie) Jiang <[email protected]>
AuthorDate: Fri Aug 1 00:35:04 2025 -0600

    Enhance REGEXP_LIKE to scan dictionary when dictionary is small (#16478)
---
 .../core/operator/filter/FilterOperatorUtils.java  | 36 ++++-------
 ...aseDictIdBasedRegexpLikePredicateEvaluator.java | 31 ++++++++++
 .../FSTBasedRegexpPredicateEvaluatorFactory.java   |  2 +-
 .../IFSTBasedRegexpPredicateEvaluatorFactory.java  |  2 +-
 .../RegexpLikePredicateEvaluatorFactory.java       | 70 ++++++++++++++++++++--
 .../tests/OfflineClusterIntegrationTest.java       | 19 ++++++
 6 files changed, 127 insertions(+), 33 deletions(-)

diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
index b6cab14d83..2f38d18443 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/FilterOperatorUtils.java
@@ -23,7 +23,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.OptionalInt;
 import org.apache.pinot.common.request.context.predicate.Predicate;
-import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
+import 
org.apache.pinot.core.operator.filter.predicate.BaseDictIdBasedRegexpLikePredicateEvaluator;
 import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
 import org.apache.pinot.core.query.request.context.QueryContext;
 import org.apache.pinot.segment.spi.datasource.DataSource;
@@ -54,20 +54,19 @@ public class FilterOperatorUtils {
     /**
      * Returns the AND filter operator or equivalent filter operator.
      */
-    BaseFilterOperator getAndFilterOperator(QueryContext queryContext,
-        List<BaseFilterOperator> filterOperators, int numDocs);
+    BaseFilterOperator getAndFilterOperator(QueryContext queryContext, 
List<BaseFilterOperator> filterOperators,
+        int numDocs);
 
     /**
      * Returns the OR filter operator or equivalent filter operator.
      */
-    BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
-        List<BaseFilterOperator> filterOperators, int numDocs);
+    BaseFilterOperator getOrFilterOperator(QueryContext queryContext, 
List<BaseFilterOperator> filterOperators,
+        int numDocs);
 
     /**
      * Returns the NOT filter operator or equivalent filter operator.
      */
-    BaseFilterOperator getNotFilterOperator(QueryContext queryContext, 
BaseFilterOperator filterOperator,
-        int numDocs);
+    BaseFilterOperator getNotFilterOperator(QueryContext queryContext, 
BaseFilterOperator filterOperator, int numDocs);
   }
 
   public static class DefaultImplementation implements Implementation {
@@ -107,24 +106,12 @@ public class FilterOperatorUtils {
         }
         return new ScanBasedFilterOperator(queryContext, predicateEvaluator, 
dataSource, numDocs);
       } else if (predicateType == Predicate.Type.REGEXP_LIKE) {
-        // Check if case-insensitive flag is present
-        RegexpLikePredicate regexpLikePredicate = (RegexpLikePredicate) 
predicateEvaluator.getPredicate();
-        boolean caseInsensitive = regexpLikePredicate.isCaseInsensitive();
-        if (caseInsensitive) {
-          if (dataSource.getIFSTIndex() != null && 
dataSource.getDataSourceMetadata().isSorted()
+        if (predicateEvaluator instanceof 
BaseDictIdBasedRegexpLikePredicateEvaluator) {
+          if (dataSource.getDataSourceMetadata().isSorted()
               && queryContext.isIndexUseAllowed(dataSource, 
FieldConfig.IndexType.SORTED)) {
             return new SortedIndexBasedFilterOperator(queryContext, 
predicateEvaluator, dataSource, numDocs);
           }
-          if (dataSource.getIFSTIndex() != null && 
dataSource.getInvertedIndex() != null
-              && queryContext.isIndexUseAllowed(dataSource, 
FieldConfig.IndexType.INVERTED)) {
-            return new InvertedIndexFilterOperator(queryContext, 
predicateEvaluator, dataSource, numDocs);
-          }
-        } else {
-          if (dataSource.getFSTIndex() != null && 
dataSource.getDataSourceMetadata().isSorted()
-              && queryContext.isIndexUseAllowed(dataSource, 
FieldConfig.IndexType.SORTED)) {
-            return new SortedIndexBasedFilterOperator(queryContext, 
predicateEvaluator, dataSource, numDocs);
-          }
-          if (dataSource.getFSTIndex() != null && 
dataSource.getInvertedIndex() != null
+          if (dataSource.getInvertedIndex() != null
               && queryContext.isIndexUseAllowed(dataSource, 
FieldConfig.IndexType.INVERTED)) {
             return new InvertedIndexFilterOperator(queryContext, 
predicateEvaluator, dataSource, numDocs);
           }
@@ -174,8 +161,8 @@ public class FilterOperatorUtils {
     }
 
     @Override
-    public BaseFilterOperator getOrFilterOperator(QueryContext queryContext,
-        List<BaseFilterOperator> filterOperators, int numDocs) {
+    public BaseFilterOperator getOrFilterOperator(QueryContext queryContext, 
List<BaseFilterOperator> filterOperators,
+        int numDocs) {
       List<BaseFilterOperator> childFilterOperators = new 
ArrayList<>(filterOperators.size());
       for (BaseFilterOperator filterOperator : filterOperators) {
         if (filterOperator.isResultMatchingAll()) {
@@ -210,7 +197,6 @@ public class FilterOperatorUtils {
       return new NotFilterOperator(filterOperator, numDocs, 
queryContext.isNullHandlingEnabled());
     }
 
-
     /**
      * For AND filter operator, reorders its child filter operators based on 
their cost and puts the ones with
      * inverted index first in order to reduce the number of documents to be 
processed.
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
new file mode 100644
index 0000000000..8067377635
--- /dev/null
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/BaseDictIdBasedRegexpLikePredicateEvaluator.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.operator.filter.predicate;
+
+import org.apache.pinot.common.request.context.predicate.Predicate;
+import org.apache.pinot.segment.spi.index.reader.Dictionary;
+
+
+/// Base class for dictionary-based REGEXP_LIKE predicate evaluators that use 
dictionary IDs for matching.
+public abstract class BaseDictIdBasedRegexpLikePredicateEvaluator extends 
BaseDictionaryBasedPredicateEvaluator {
+
+  protected BaseDictIdBasedRegexpLikePredicateEvaluator(Predicate predicate, 
Dictionary dictionary) {
+    super(predicate, dictionary);
+  }
+}
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
index 11dbe7aa99..8cbdb42884 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/FSTBasedRegexpPredicateEvaluatorFactory.java
@@ -49,7 +49,7 @@ public class FSTBasedRegexpPredicateEvaluatorFactory {
   /**
    * Matches regexp query using FSTIndexReader.
    */
-  private static class FSTBasedRegexpPredicateEvaluator extends 
BaseDictionaryBasedPredicateEvaluator {
+  private static class FSTBasedRegexpPredicateEvaluator extends 
BaseDictIdBasedRegexpLikePredicateEvaluator {
     final ImmutableRoaringBitmap _matchingDictIdBitmap;
 
     public FSTBasedRegexpPredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate, TextIndexReader fstIndexReader,
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
index a2d82b765d..074d44d3e9 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/IFSTBasedRegexpPredicateEvaluatorFactory.java
@@ -45,7 +45,7 @@ public class IFSTBasedRegexpPredicateEvaluatorFactory {
     return new IFSTBasedRegexpPredicateEvaluator(regexpLikePredicate, 
ifstIndexReader, dictionary);
   }
 
-  private static class IFSTBasedRegexpPredicateEvaluator extends 
BaseDictionaryBasedPredicateEvaluator {
+  private static class IFSTBasedRegexpPredicateEvaluator extends 
BaseDictIdBasedRegexpLikePredicateEvaluator {
     final ImmutableRoaringBitmap _matchingDictIdBitmap;
 
     public IFSTBasedRegexpPredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate,
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
index e9c2bb73fd..169230831c 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
@@ -19,6 +19,10 @@
 package org.apache.pinot.core.operator.filter.predicate;
 
 import com.google.common.base.Preconditions;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.ints.IntSet;
 import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
 import org.apache.pinot.common.utils.regex.Matcher;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
@@ -32,6 +36,9 @@ public class RegexpLikePredicateEvaluatorFactory {
   private RegexpLikePredicateEvaluatorFactory() {
   }
 
+  /// When the cardinality of the dictionary is less than this threshold, scan 
the dictionary to get the matching ids.
+  public static final int DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN = 10000;
+
   /**
    * Create a new instance of dictionary based REGEXP_LIKE predicate evaluator.
    *
@@ -42,9 +49,12 @@ public class RegexpLikePredicateEvaluatorFactory {
    */
   public static BaseDictionaryBasedPredicateEvaluator 
newDictionaryBasedEvaluator(
       RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType 
dataType) {
-    boolean condition = (dataType == DataType.STRING || dataType == 
DataType.JSON);
-    Preconditions.checkArgument(condition, "Unsupported data type: " + 
dataType);
-    return new 
DictionaryBasedRegexpLikePredicateEvaluator(regexpLikePredicate, dictionary);
+    Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING, 
"Unsupported data type: " + dataType);
+    if (dictionary.length() < DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN) {
+      return new DictIdBasedRegexpLikePredicateEvaluator(regexpLikePredicate, 
dictionary);
+    } else {
+      return new ScanBasedRegexpLikePredicateEvaluator(regexpLikePredicate, 
dictionary);
+    }
   }
 
   /**
@@ -56,16 +66,64 @@ public class RegexpLikePredicateEvaluatorFactory {
    */
   public static BaseRawValueBasedPredicateEvaluator 
newRawValueBasedEvaluator(RegexpLikePredicate regexpLikePredicate,
       DataType dataType) {
-    Preconditions.checkArgument(dataType == DataType.STRING, "Unsupported data 
type: " + dataType);
+    Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING, 
"Unsupported data type: " + dataType);
     return new RawValueBasedRegexpLikePredicateEvaluator(regexpLikePredicate);
   }
 
-  private static final class DictionaryBasedRegexpLikePredicateEvaluator 
extends BaseDictionaryBasedPredicateEvaluator {
+  private static final class DictIdBasedRegexpLikePredicateEvaluator
+      extends BaseDictIdBasedRegexpLikePredicateEvaluator {
+    final IntSet _matchingDictIdSet;
+
+    public DictIdBasedRegexpLikePredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate, Dictionary dictionary) {
+      super(regexpLikePredicate, dictionary);
+      Matcher matcher = regexpLikePredicate.getPattern().matcher("");
+      IntList matchingDictIds = new IntArrayList();
+      int dictionarySize = _dictionary.length();
+      for (int dictId = 0; dictId < dictionarySize; dictId++) {
+        if (matcher.reset(dictionary.getStringValue(dictId)).find()) {
+          matchingDictIds.add(dictId);
+        }
+      }
+      int numMatchingDictIds = matchingDictIds.size();
+      if (numMatchingDictIds == 0) {
+        _alwaysFalse = true;
+      } else if (dictionarySize == numMatchingDictIds) {
+        _alwaysTrue = true;
+      }
+      _matchingDictIds = matchingDictIds.toIntArray();
+      _matchingDictIdSet = new IntOpenHashSet(_matchingDictIds);
+    }
+
+    @Override
+    public int getNumMatchingItems() {
+      return _matchingDictIdSet.size();
+    }
+
+    @Override
+    public boolean applySV(int dictId) {
+      return _matchingDictIdSet.contains(dictId);
+    }
+
+    @Override
+    public int applySV(int limit, int[] docIds, int[] values) {
+      // reimplemented here to ensure applySV can be inlined
+      int matches = 0;
+      for (int i = 0; i < limit; i++) {
+        int value = values[i];
+        if (applySV(value)) {
+          docIds[matches++] = docIds[i];
+        }
+      }
+      return matches;
+    }
+  }
+
+  private static final class ScanBasedRegexpLikePredicateEvaluator extends 
BaseDictionaryBasedPredicateEvaluator {
     // Reuse matcher to avoid excessive allocation. This is safe to do because 
the evaluator is always used
     // within the scope of a single thread.
     final Matcher _matcher;
 
-    public DictionaryBasedRegexpLikePredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate, Dictionary dictionary) {
+    public ScanBasedRegexpLikePredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate, Dictionary dictionary) {
       super(regexpLikePredicate, dictionary);
       _matcher = regexpLikePredicate.getPattern().matcher("");
     }
diff --git 
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
 
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
index 36af424482..83ea214643 100644
--- 
a/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
+++ 
b/pinot-integration-tests/src/test/java/org/apache/pinot/integration/tests/OfflineClusterIntegrationTest.java
@@ -1893,6 +1893,8 @@ public class OfflineClusterIntegrationTest extends 
BaseClusterIntegrationTestSet
     }, 60_000L, "Failed to query new added columns without reload");
     // Table size shouldn't change without reload
     assertEquals(getTableSize(getTableName()), _tableSize);
+    // Test REGEXP_LIKE on new added columns
+    testRegexpLikeOnNewAddedColumns();
 
     // Trigger reload and verify column count
     reloadAllSegments(TEST_EXTRA_COLUMNS_QUERY, false, numTotalDocs);
@@ -1942,6 +1944,7 @@ public class OfflineClusterIntegrationTest extends 
BaseClusterIntegrationTestSet
     
assertTrue(derivedNullStringColumnIndex.has(StandardIndexes.NULL_VALUE_VECTOR_ID));
 
     testNewAddedColumns();
+    testRegexpLikeOnNewAddedColumns();
 
     // The multi-stage query engine doesn't support expression overrides 
currently
     if (!useMultiStageQueryEngine()) {
@@ -2209,6 +2212,22 @@ public class OfflineClusterIntegrationTest extends 
BaseClusterIntegrationTestSet
     assertEquals(row.get(12).asDouble(), 0.0);
   }
 
+  private void testRegexpLikeOnNewAddedColumns()
+      throws Exception {
+    int numTotalDocs = (int) getCountStarResult();
+
+    // REGEXP_LIKE on new added dictionary-encoded columns should not scan the 
table when it matches all or nothing
+    for (String column : List.of("NewAddedSVJSONDimension", 
"NewAddedDerivedNullString")) {
+      JsonNode response = postQuery("SELECT COUNT(*) FROM mytable WHERE 
REGEXP_LIKE(" + column + ", 'foo')");
+      
assertEquals(response.get("resultTable").get("rows").get(0).get(0).asInt(), 0);
+      assertEquals(response.get("numEntriesScannedInFilter").asInt(), 0);
+
+      response = postQuery("SELECT COUNT(*) FROM mytable WHERE REGEXP_LIKE(" + 
column + ", '.*')");
+      
assertEquals(response.get("resultTable").get("rows").get(0).get(0).asInt(), 
numTotalDocs);
+      assertEquals(response.get("numEntriesScannedInFilter").asInt(), 0);
+    }
+  }
+
   private void testExpressionOverride()
       throws Exception {
     String query = "SELECT COUNT(*) FROM mytable WHERE DaysSinceEpoch * 24 = 
392184";


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to