This is an automated email from the ASF dual-hosted git repository.

tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 3b5ef194466 [ngram index part 1]Add realtime ngram filtering index and 
benchmark results. (#16364)
3b5ef194466 is described below

commit 3b5ef194466bf2f740715563dad2c31014eae786
Author: Ting Chen <[email protected]>
AuthorDate: Thu Oct 2 09:23:44 2025 -0700

    [ngram index part 1]Add realtime ngram filtering index and benchmark 
results. (#16364)
    
    * [ngram index]Add realtime ngram filtering index and benchmark results.
    
    * Fix an linter issue.
    
    * Fix an linter issue.
    
    * Improve bitmap and operations.
    
    * Fix based on comments.
---
 .../pinot/perf/BenchmarkNgramFilteringIndex.java   | 102 +++++++++++
 .../invertedindex/RealtimeNgramFilteringIndex.java | 196 +++++++++++++++++++++
 .../RealtimeNgramFilteringIndexTest.java           |  96 ++++++++++
 3 files changed, 394 insertions(+)

diff --git 
a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
 
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
new file mode 100644
index 00000000000..f202e90c1cc
--- /dev/null
+++ 
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.perf;
+
+import java.util.concurrent.TimeUnit;
+import 
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeNgramFilteringIndex;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.roaringbitmap.IntIterator;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+/**
+ * This benchmark is a benchmark for testing the performance of N-gram 
filtering index vs a pure regex matcher
+ */
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Fork(1)
+@Warmup(iterations = 1)
+@Measurement(iterations = 2)
+@State(Scope.Benchmark)
+public class BenchmarkNgramFilteringIndex {
+  public static final String PREFIX = "somelonglongveryverylongword";
+  RealtimeNgramFilteringIndex _realtimeNgramFilteringIndex;
+
+  public static void main(String[] args)
+      throws Exception {
+    ChainedOptionsBuilder opt = new 
OptionsBuilder().include(BenchmarkNgramFilteringIndex.class.getSimpleName());
+    new Runner(opt.build()).run();
+  }
+
+  @Setup(Level.Trial)
+  public void setUp()
+      throws Exception {
+    _realtimeNgramFilteringIndex = new RealtimeNgramFilteringIndex("col", 2, 
3);
+    // Load the index with 10000 words
+    for (int i = 0; i < 10000; i++) {
+      _realtimeNgramFilteringIndex.add(PREFIX + i);
+    }
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.All)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
+  public int benchmarkTextMatchingUsingNgram() {
+    // First retrieve the docIds using the N-gram index
+    MutableRoaringBitmap map = _realtimeNgramFilteringIndex.getDocIds("ord78");
+    IntIterator intIterator = map.getIntIterator();
+    // Next doing regex matching on the docIds to validate the results
+    while (intIterator.hasNext()) {
+      int docId = intIterator.next();
+      // Simulate regex validate on the actual document.
+      (PREFIX + docId).matches(".*ord78.*");
+    }
+    return 0;
+  }
+
+  @Benchmark
+  @BenchmarkMode(Mode.All)
+  @OutputTimeUnit(TimeUnit.MILLISECONDS)
+  public int benchmarkMatchingViaRegexScan() {
+    // Matching the documents using regex scan
+    for (int i = 0; i < 10000; i++) {
+      (PREFIX + i).matches(".*ord78.*");
+    }
+    return 0;
+  }
+
+  @TearDown
+  public void tearDown()
+      throws Exception {
+    _realtimeNgramFilteringIndex.close();
+  }
+}
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
new file mode 100644
index 00000000000..7f841ae7f8e
--- /dev/null
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.impl.invertedindex;
+
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+/**
+ * RealtimeNgramFilteringIndex is a mutable inverted index that supports 
adding document IDs for n-grams. An n-gram
+ * is a contiguous sequence of n characters from input text. The length of the 
n-gram is variable between
+ * [minNgramLength, maxNgramLength] as specified in the index config. 
Internally, it uses a realtime inverted index
+ * which maps n-grams to their posting lists (document IDs). The index is 
thread-safe for single writer and multiple
+ * readers.
+ */
+public class RealtimeNgramFilteringIndex implements MutableTextIndex {
+  private final String _column;
+  // Under the hood, ngram index is implemented as an inverted index from 
n-grams to their posting lists.
+  private final RealtimeInvertedIndex _invertedIndex;
+  // A mapping from n-grams to their dictionary IDs.
+  private final Object2IntOpenHashMap<String> _ngramToDictIdMapping;
+  // Read and write locks for thread safety.
+  private final ReentrantReadWriteLock.ReadLock _readLock;
+  private final ReentrantReadWriteLock.WriteLock _writeLock;
+
+  // Next document ID to be assigned.
+  private int _nextDocId = 0;
+  private int _nextDictId = 0;
+
+  private final int _minNgramLength;
+  private final int _maxNgramLength;
+
+  public RealtimeNgramFilteringIndex(String column, int minNgramLength, int 
maxNgramLength) {
+    _column = column;
+    _invertedIndex = new RealtimeInvertedIndex();
+    _ngramToDictIdMapping = new Object2IntOpenHashMap<>();
+    _ngramToDictIdMapping.defaultReturnValue(-1);
+    if (minNgramLength <= 0 || maxNgramLength <= 0 || minNgramLength > 
maxNgramLength) {
+      throw new IllegalArgumentException("Invalid n-gram settings: " + 
minNgramLength + "..." + maxNgramLength);
+    }
+    _minNgramLength = minNgramLength;
+    _maxNgramLength = maxNgramLength;
+
+    ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock();
+    _readLock = readWriteLock.readLock();
+    _writeLock = readWriteLock.writeLock();
+  }
+
+  /**
+   * Index the string value
+   *
+   * @param value the input value as a string
+   */
+  @Override
+  public void add(String value) {
+    if (value == null) {
+      return;
+    }
+    addHelper(value);
+    _nextDocId++;
+  }
+
+  /**
+   * Index an array of string values. Each string in the array is treated as a 
separate document.
+   *
+   * @param values the documents as an array of strings
+   */
+  @Override
+  public void add(String[] values) {
+    if (values == null || values.length == 0) {
+      return;
+    }
+    for (String value : values) {
+      if (value != null) {
+        addHelper(value);
+      }
+    }
+    _nextDocId++;
+  }
+
+  /**
+   * Returns the matching dictionary ids for the given search query (optional).
+   *
+   * @param searchQuery as a literal string
+   */
+  @Override
+  public ImmutableRoaringBitmap getDictIds(String searchQuery) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Returns the matching document ids for the given search query.
+   * Returns null if no n-grams are generated from the search query -- either 
because the search query is null or empty
+   * or shorter than the minimum n-gram length.
+   * @param searchQuery as a literal string
+   */
+  @Override
+  public MutableRoaringBitmap getDocIds(String searchQuery) {
+    _readLock.lock();
+    Iterable<String> ngrams = generateNgrams(searchQuery);
+    if (!ngrams.iterator().hasNext()) {
+      return null; // No n-grams generated, return null.
+    }
+    ArrayList<MutableRoaringBitmap> bitmapLst = new ArrayList<>();
+    try {
+      for (String ngram : ngrams) {
+        int dictId = _ngramToDictIdMapping.getInt(ngram);
+        bitmapLst.add(_invertedIndex.getDocIds(dictId));
+      }
+      if (bitmapLst.isEmpty()) {
+        return null; // No n-grams found in the index.
+      }
+      MutableRoaringBitmap resultBitmap = bitmapLst.get(0);
+      for (int i = 1; i < bitmapLst.size(); i++) {
+        resultBitmap.and(bitmapLst.get(i));
+      }
+      return resultBitmap;
+    } finally {
+      _readLock.unlock();
+    }
+  }
+
+  /**
+   * Closes this stream and releases any system resources associated with it. 
If the stream is already closed then
+   * invoking this method has no effect.
+   *
+   * <p> As noted in {@link AutoCloseable#close()}, cases where the
+   * close may fail require careful attention. It is strongly advised to 
relinquish the underlying resources and to
+   * internally
+   * <em>mark</em> the {@code Closeable} as closed, prior to throwing
+   * the {@code IOException}.
+   *
+   * @throws IOException if an I/O error occurs
+   */
+  @Override
+  public void close()
+      throws IOException {
+    _ngramToDictIdMapping.clear();
+  }
+
+  private void addHelper(String value) {
+    Iterable<String> ngrams = generateNgrams(value);
+    _writeLock.lock();
+    try {
+      for (String ngram : ngrams) {
+        int currentDictId = _ngramToDictIdMapping.computeIfAbsent(ngram, k -> 
_nextDictId++);
+        _invertedIndex.add(currentDictId, _nextDocId);
+      }
+    } finally {
+      _writeLock.unlock();
+    }
+  }
+
+  // Use Lucene's NGramTokenizer to generate n-grams from the input value.
+  private Iterable<String> generateNgrams(String value) {
+    if (value == null || value.isEmpty()) {
+      return java.util.Collections.emptyList();
+    }
+    ArrayList<String> ngrams = new ArrayList<>();
+    // Implement the logic to generate n-grams from the input value.
+    try (NGramTokenizer nGramTokenizer = new NGramTokenizer(_minNgramLength, 
_maxNgramLength)) {
+      nGramTokenizer.setReader(new java.io.StringReader(value));
+      nGramTokenizer.reset();
+      while (nGramTokenizer.incrementToken()) {
+        // Extract the text term as a string and add it to the list.
+        
ngrams.add(nGramTokenizer.getAttribute(CharTermAttribute.class).toString());
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+    return ngrams;
+  }
+}
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
new file mode 100644
index 00000000000..2cf25954733
--- /dev/null
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.impl.invertedindex;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNull;
+import static org.testng.Assert.assertTrue;
+
+
+public class RealtimeNgramFilteringIndexTest {
+  // ngram length from 2 to 3
+  private RealtimeNgramFilteringIndex _ngram2To3Index;
+  // ngram length from 2 to 2
+  private RealtimeNgramFilteringIndex _ngram2To2Index;
+
+  @BeforeClass
+  public void setUp()
+      throws Exception {
+    _ngram2To3Index = new RealtimeNgramFilteringIndex("col", 2, 3);
+    _ngram2To2Index = new RealtimeNgramFilteringIndex("col", 2, 2);
+    List<String> documents = getTextData();
+
+    for (String doc : documents) {
+      _ngram2To3Index.add(doc);
+      _ngram2To2Index.add(doc);
+    }
+  }
+
+  @AfterClass
+  public void tearDown()
+      throws IOException {
+    _ngram2To3Index.close();
+    _ngram2To2Index.close();
+  }
+
+  @Test
+  public void testQueries() {
+    String ngramQuery = "drew";
+    testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1));
+    testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4));
+
+    ngramQuery = "rew";
+    testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1, 4));
+    testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4, 
5));
+
+    ngramQuery = "re";
+    testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1, 4, 
5));
+    testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4, 
5));
+
+    ngramQuery = "r";
+    testSelectionResults(_ngram2To3Index, ngramQuery, null);
+    testSelectionResults(_ngram2To2Index, ngramQuery, null);
+  }
+
+  private List<String> getTextData() {
+    return Arrays.asList("andrew", "drerew", "draw", "dr", "dr and rew", "ew 
re");
+  }
+
+  private void testSelectionResults(RealtimeNgramFilteringIndex 
realtimeNgramFilteringIndex, String nativeQuery,
+      List<Integer> results) {
+    ImmutableRoaringBitmap resultMap = 
realtimeNgramFilteringIndex.getDocIds(nativeQuery);
+    if (resultMap == null) {
+      assertNull(results);
+      return;
+    }
+
+    assertEquals(resultMap.getCardinality(), results.size());
+    for (int result : results) {
+      assertTrue(resultMap.contains(result));
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to