This is an automated email from the ASF dual-hosted git repository.

richardstartin pushed a commit to branch rgs/optimise-regexp-like
in repository https://gitbox.apache.org/repos/asf/pinot.git

commit 7aca15f3f48490b909c917d891cae152eef7192f
Author: Richard Startin <richardstar...@apache.org>
AuthorDate: Thu Mar 9 22:36:16 2023 +0000

    improve regexp like evaluation against dictionary by scanning dictionary to build set of matching dictIds
---
 .../RegexpLikePredicateEvaluatorFactory.java       | 38 ++++++++++++++--------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
index a3af9de194..956c9f3743 100644
--- a/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
+++ b/pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/RegexpLikePredicateEvaluatorFactory.java
@@ -19,12 +19,13 @@
 package org.apache.pinot.core.operator.filter.predicate;
 
 import com.google.common.base.Preconditions;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
-import it.unimi.dsi.fastutil.ints.IntList;
 import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
 import org.apache.pinot.segment.spi.index.reader.Dictionary;
 import org.apache.pinot.spi.data.FieldSpec.DataType;
+import org.roaringbitmap.RoaringBitmap;
+import org.roaringbitmap.RoaringBitmapWriter;
 
 
 /**
@@ -62,21 +63,22 @@ public class RegexpLikePredicateEvaluatorFactory {
   }
 
   private static final class DictionaryBasedRegexpLikePredicateEvaluator extends BaseDictionaryBasedPredicateEvaluator {
-    // Reuse matcher to avoid excessive allocation. This is safe to do because the evaluator is always used
-    // within the scope of a single thread.
-    final Matcher _matcher;
+    final Pattern _pattern;
     final Dictionary _dictionary;
+    RoaringBitmap _matchingDictIdsBitmap;
     int[] _matchingDictIds;
 
     public DictionaryBasedRegexpLikePredicateEvaluator(RegexpLikePredicate regexpLikePredicate, Dictionary dictionary) {
       super(regexpLikePredicate);
       _dictionary = dictionary;
-      _matcher = regexpLikePredicate.getPattern().matcher("");
+      _pattern = regexpLikePredicate.getPattern();
     }
 
     @Override
     public boolean applySV(int dictId) {
-      return _matcher.reset(_dictionary.getStringValue(dictId)).find();
+      // delay scanning the dictionary until planning is complete
+      ensureDictionaryScanned();
+      return _matchingDictIdsBitmap.contains(dictId);
     }
 
     @Override
@@ -95,16 +97,24 @@ public class RegexpLikePredicateEvaluatorFactory {
     @Override
     public int[] getMatchingDictIds() {
       if (_matchingDictIds == null) {
-        IntList matchingDictIds = new IntArrayList();
-        int dictionarySize = _dictionary.length();
-        for (int dictId = 0; dictId < dictionarySize; dictId++) {
-          if (applySV(dictId)) {
-            matchingDictIds.add(dictId);
+        ensureDictionaryScanned();
+        _matchingDictIds = _matchingDictIdsBitmap.toArray();
+      }
+      return _matchingDictIds;
+    }
+
+    private void ensureDictionaryScanned() {
+      if (_matchingDictIdsBitmap == null) {
+        RoaringBitmapWriter<RoaringBitmap> writer = RoaringBitmapWriter.writer().runCompress(false).get();
+        Matcher matcher = _pattern.matcher("");
+        for (int dictId = 0; dictId < _dictionary.length(); dictId++) {
+          String value = _dictionary.getStringValue(dictId);
+          if (matcher.reset(value).find()) {
+            writer.add(dictId);
           }
         }
-        _matchingDictIds = matchingDictIds.toIntArray();
+        _matchingDictIdsBitmap = writer.get();
       }
-      return _matchingDictIds;
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to