This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new a4193e6c270 Use 2 bitsets in ScanBasedRegexpLikePredicateEvaluator to 
track DictIdToRegexMatcher for REGEX_LIKE predicate (#16922)
a4193e6c270 is described below

commit a4193e6c270b207fcf9310dc477c2fba2330d1c0
Author: Chaitanya Deepthi <[email protected]>
AuthorDate: Mon Oct 6 17:25:24 2025 -0700

    Use 2 bitsets in ScanBasedRegexpLikePredicateEvaluator to track 
DictIdToRegexMatcher for REGEX_LIKE predicate (#16922)
    
    * Take in common constants
    
    * Checkstyle fixes
    
    * Remove unused variable
    
    * Format the comment
    
    * Checkstyle fix
    
    * Remove the count in ScanBasedRegexpLikePredicateEvaluator
    
    * Address review comments
    
    * Add comment
    
    * Review comment
    
    * Change the datatype to BitSet
    
    * Remove the dictionary based scan for Regex Like Expressions
    
    * Remove the configs that are unused
    
    * checkstyle changes
    
    * checkstyle fixes
    
    * fix test
    
    * Add back dict based scanning in REGEX_LIKE expressions
    
    * Revert back the test
    
    * Change config key and minor cleanup
    
    * Rename a variable
    
    ---------
    
    Co-authored-by: Xiaotian (Jackie) Jiang <[email protected]>
---
 .../requesthandler/BaseBrokerRequestHandler.java   |  7 ++++
 .../BaseSingleStageBrokerRequestHandler.java       |  4 ++
 .../common/utils/config/QueryOptionsUtils.java     | 11 ++++++
 .../predicate/PredicateEvaluatorProvider.java      | 14 +++----
 .../RegexpLikePredicateEvaluatorFactory.java       | 45 ++++++++++++++++++----
 .../apache/pinot/spi/utils/CommonConstants.java    | 10 +++++
 6 files changed, 76 insertions(+), 15 deletions(-)

diff --git 
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
 
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
index 615d9c0383b..269cc395e67 100644
--- 
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
+++ 
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseBrokerRequestHandler.java
@@ -90,6 +90,8 @@ public abstract class BaseBrokerRequestHandler implements 
BrokerRequestHandler {
   protected final QueryLogger _queryLogger;
   @Nullable
   protected final String _enableNullHandling;
+  @Nullable
+  protected final String _regexDictSizeThreshold;
   protected final boolean _enableQueryCancellation;
 
   /**
@@ -121,6 +123,7 @@ public abstract class BaseBrokerRequestHandler implements 
BrokerRequestHandler {
         Broker.DEFAULT_BROKER_ENABLE_ROW_COLUMN_LEVEL_AUTH);
     _queryLogger = new QueryLogger(config);
     _enableNullHandling = 
config.getProperty(Broker.CONFIG_OF_BROKER_QUERY_ENABLE_NULL_HANDLING);
+    _regexDictSizeThreshold = 
config.getProperty(Broker.CONFIG_OF_BROKER_QUERY_REGEX_DICT_SIZE_THRESHOLD);
     _enableQueryCancellation = 
config.getProperty(Broker.CONFIG_OF_BROKER_ENABLE_QUERY_CANCELLATION,
         Broker.DEFAULT_BROKER_ENABLE_QUERY_CANCELLATION);
     if (_enableQueryCancellation) {
@@ -203,6 +206,10 @@ public abstract class BaseBrokerRequestHandler implements 
BrokerRequestHandler {
       
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.ENABLE_NULL_HANDLING, 
_enableNullHandling);
     }
 
+    if (_regexDictSizeThreshold != null) {
+      
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD,
 _regexDictSizeThreshold);
+    }
+
     BrokerResponse brokerResponse =
         handleRequest(requestId, query, sqlNodeAndOptions, request, 
requesterIdentity, requestContext, httpHeaders,
             accessControl);
diff --git 
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
 
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
index 36feabc993a..4cf79bd233d 100644
--- 
a/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
+++ 
b/pinot-broker/src/main/java/org/apache/pinot/broker/requesthandler/BaseSingleStageBrokerRequestHandler.java
@@ -1213,6 +1213,10 @@ public abstract class 
BaseSingleStageBrokerRequestHandler extends BaseBrokerRequ
             .putIfAbsent(Broker.Request.QueryOptionKey.ENABLE_NULL_HANDLING, 
_enableNullHandling);
       }
 
+      if (_regexDictSizeThreshold != null) {
+        
sqlNodeAndOptions.getOptions().putIfAbsent(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD,
 _regexDictSizeThreshold);
+      }
+
       BrokerResponse response =
           doHandleRequest(requestId, subquery, sqlNodeAndOptions, jsonRequest, 
requesterIdentity, requestContext,
               httpHeaders, accessControl);
diff --git 
a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
 
b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
index f4bb0469ed5..4236e379617 100644
--- 
a/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
+++ 
b/pinot-common/src/main/java/org/apache/pinot/common/utils/config/QueryOptionsUtils.java
@@ -585,4 +585,15 @@ public class QueryOptionsUtils {
     }
     return Boolean.parseBoolean(value);
   }
+
+  /// When evaluating REGEXP_LIKE predicate on a dictionary encoded column:
+  /// - If dictionary size is smaller than this threshold, scan the dictionary to get the matching dictionary ids
+  ///   first, where inverted index can be applied if one exists
+  /// - Otherwise, read dictionary while scanning the forward index, cache the matching/unmatching dictionary ids
+  ///   during the scan
+  @Nullable
+  public static Integer getRegexDictSizeThreshold(Map<String, String> 
queryOptions) {
+    String regexDictSizeThreshold = 
queryOptions.get(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD);
+    return uncheckedParseInt(QueryOptionKey.REGEX_DICT_SIZE_THRESHOLD, 
regexDictSizeThreshold);
+  }
 }
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
index 6db11163282..033cb68b537 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/PredicateEvaluatorProvider.java
@@ -49,11 +49,11 @@ public class PredicateEvaluatorProvider {
         // dictionary based predicate evaluators
         switch (predicate.getType()) {
           case EQ:
-            return EqualsPredicateEvaluatorFactory
-                .newDictionaryBasedEvaluator((EqPredicate) predicate, 
dictionary, dataType);
+            return 
EqualsPredicateEvaluatorFactory.newDictionaryBasedEvaluator((EqPredicate) 
predicate, dictionary,
+                dataType);
           case NOT_EQ:
-            return NotEqualsPredicateEvaluatorFactory
-                .newDictionaryBasedEvaluator((NotEqPredicate) predicate, 
dictionary, dataType);
+            return 
NotEqualsPredicateEvaluatorFactory.newDictionaryBasedEvaluator((NotEqPredicate) 
predicate,
+                dictionary, dataType);
           case IN:
             return 
InPredicateEvaluatorFactory.newDictionaryBasedEvaluator((InPredicate) 
predicate, dictionary,
                 dataType, queryContext);
@@ -61,11 +61,11 @@ public class PredicateEvaluatorProvider {
             return 
NotInPredicateEvaluatorFactory.newDictionaryBasedEvaluator((NotInPredicate) 
predicate, dictionary,
                 dataType, queryContext);
           case RANGE:
-            return RangePredicateEvaluatorFactory
-                .newDictionaryBasedEvaluator((RangePredicate) predicate, 
dictionary, dataType);
+            return 
RangePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RangePredicate) 
predicate, dictionary,
+                dataType);
           case REGEXP_LIKE:
             return 
RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RegexpLikePredicate)
 predicate,
-                dictionary, dataType);
+                dictionary, dataType, queryContext);
           default:
             throw new UnsupportedOperationException("Unsupported predicate 
type: " + predicate.getType());
         }
diff --git 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
index ac91cad9c52..a022f905cf7 100644
--- 
a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
+++ 
b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
@@ -23,10 +23,16 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
 import it.unimi.dsi.fastutil.ints.IntSet;
+import java.util.BitSet;
+import javax.annotation.Nullable;
 import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
+import org.apache.pinot.common.utils.config.QueryOptionsUtils;
 import org.apache.pinot.common.utils.regex.Matcher;
+import org.apache.pinot.core.query.request.context.QueryContext;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
+import 
org.apache.pinot.spi.utils.CommonConstants.Broker.Request.QueryOptionValue;
+
 
 /**
  * Factory for REGEXP_LIKE predicate evaluators.
@@ -35,21 +41,27 @@ public class RegexpLikePredicateEvaluatorFactory {
   private RegexpLikePredicateEvaluatorFactory() {
   }
 
-  /// When the cardinality of the dictionary is less than this threshold, scan 
the dictionary to get the matching ids.
-  public static final int DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN = 10000;
-
   /**
    * Create a new instance of dictionary based REGEXP_LIKE predicate evaluator.
    *
    * @param regexpLikePredicate REGEXP_LIKE predicate to evaluate
-   * @param dictionary Dictionary for the column
-   * @param dataType Data type for the column
+   * @param dictionary          Dictionary for the column
+   * @param dataType            Data type for the column
+   * @param queryContext        Query context to read the regexDictSizeThreshold option from; null uses the default
    * @return Dictionary based REGEXP_LIKE predicate evaluator
    */
   public static BaseDictionaryBasedPredicateEvaluator 
newDictionaryBasedEvaluator(
-      RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType 
dataType) {
+      RegexpLikePredicate regexpLikePredicate, Dictionary dictionary, DataType 
dataType,
+      @Nullable QueryContext queryContext) {
     Preconditions.checkArgument(dataType.getStoredType() == DataType.STRING, 
"Unsupported data type: " + dataType);
-    if (dictionary.length() < DICTIONARY_CARDINALITY_THRESHOLD_FOR_SCAN) {
+    Integer regexDictSizeThreshold = null;
+    if (queryContext != null) {
+      regexDictSizeThreshold = 
QueryOptionsUtils.getRegexDictSizeThreshold(queryContext.getQueryOptions());
+    }
+    if (regexDictSizeThreshold == null) {
+      regexDictSizeThreshold = 
QueryOptionValue.DEFAULT_REGEX_DICT_SIZE_THRESHOLD;
+    }
+    if (dictionary.length() < regexDictSizeThreshold) {
       return new DictIdBasedRegexpLikePredicateEvaluator(regexpLikePredicate, 
dictionary);
     } else {
       return new ScanBasedRegexpLikePredicateEvaluator(regexpLikePredicate, 
dictionary);
@@ -122,14 +134,31 @@ public class RegexpLikePredicateEvaluatorFactory {
     // within the scope of a single thread.
     final Matcher _matcher;
 
+    // _evaluatedIds: tracks which dictionary IDs have been evaluated
+    // _matchingIds: tracks which dictionary IDs match the regex pattern
+    final BitSet _evaluatedIds;
+    final BitSet _matchingIds;
+
     public ScanBasedRegexpLikePredicateEvaluator(RegexpLikePredicate 
regexpLikePredicate, Dictionary dictionary) {
       super(regexpLikePredicate, dictionary);
       _matcher = regexpLikePredicate.getPattern().matcher("");
+      int dictionarySize = dictionary.length();
+      _evaluatedIds = new BitSet(dictionarySize);
+      _matchingIds = new BitSet(dictionarySize);
     }
 
     @Override
     public boolean applySV(int dictId) {
-      return _matcher.reset(_dictionary.getStringValue(dictId)).find();
+      // Check if already evaluated
+      if (_evaluatedIds.get(dictId)) {
+        return _matchingIds.get(dictId);
+      }
+      boolean match = 
_matcher.reset(_dictionary.getStringValue(dictId)).find();
+      _evaluatedIds.set(dictId);
+      if (match) {
+        _matchingIds.set(dictId);
+      }
+      return match;
     }
 
     @Override
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
index 9864ad80fb5..2b229abe89d 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
@@ -350,6 +350,9 @@ public class CommonConstants {
         "pinot.broker.query.log.logBeforeProcessing";
     public static final boolean DEFAULT_BROKER_QUERY_LOG_BEFORE_PROCESSING = 
true;
     public static final String CONFIG_OF_BROKER_QUERY_ENABLE_NULL_HANDLING = 
"pinot.broker.query.enable.null.handling";
+    /// Provide broker level default for query option 
[Request.QueryOptionKey#REGEX_DICT_SIZE_THRESHOLD]
+    public static final String 
CONFIG_OF_BROKER_QUERY_REGEX_DICT_SIZE_THRESHOLD =
+        "pinot.broker.query.regex.dict.size.threshold";
     public static final String CONFIG_OF_BROKER_ENABLE_QUERY_CANCELLATION = 
"pinot.broker.enable.query.cancellation";
     public static final boolean DEFAULT_BROKER_ENABLE_QUERY_CANCELLATION = 
true;
     public static final double DEFAULT_BROKER_QUERY_LOG_MAX_RATE_PER_SECOND = 
10_000d;
@@ -715,6 +718,12 @@ public class CommonConstants {
         public static final String IN_PREDICATE_PRE_SORTED = 
"inPredicatePreSorted";
         public static final String IN_PREDICATE_LOOKUP_ALGORITHM = 
"inPredicateLookupAlgorithm";
 
+        // When evaluating REGEXP_LIKE predicate on a dictionary encoded column:
+        // - If dictionary size is smaller than this threshold, scan the dictionary to get the matching dictionary ids
+        //   first, where inverted index can be applied if one exists
+        // - Otherwise, read dictionary while scanning the forward index, cache the matching/unmatching dictionary ids
+        //   during the scan
+        public static final String REGEX_DICT_SIZE_THRESHOLD = 
"regexDictSizeThreshold";
 
         public static final String DROP_RESULTS = "dropResults";
 
@@ -816,6 +825,7 @@ public class CommonConstants {
 
       public static class QueryOptionValue {
         public static final int DEFAULT_MAX_STREAMING_PENDING_BLOCKS = 100;
+        public static final int DEFAULT_REGEX_DICT_SIZE_THRESHOLD = 10000;
       }
     }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to