This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 3b5ef194466 [ngram index part 1]Add realtime ngram filtering index and
benchmark results. (#16364)
3b5ef194466 is described below
commit 3b5ef194466bf2f740715563dad2c31014eae786
Author: Ting Chen <[email protected]>
AuthorDate: Thu Oct 2 09:23:44 2025 -0700
[ngram index part 1]Add realtime ngram filtering index and benchmark
results. (#16364)
* [ngram index]Add realtime ngram filtering index and benchmark results.
* Fix an linter issue.
* Fix an linter issue.
* Improve bitmap and operations.
* Fix based on comments.
---
.../pinot/perf/BenchmarkNgramFilteringIndex.java | 102 +++++++++++
.../invertedindex/RealtimeNgramFilteringIndex.java | 196 +++++++++++++++++++++
.../RealtimeNgramFilteringIndexTest.java | 96 ++++++++++
3 files changed, 394 insertions(+)
diff --git
a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
new file mode 100644
index 00000000000..f202e90c1cc
--- /dev/null
+++
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkNgramFilteringIndex.java
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.perf;
+
+import java.util.concurrent.TimeUnit;
+import
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeNgramFilteringIndex;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+import org.roaringbitmap.IntIterator;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+/**
+ * This benchmark is a benchmark for testing the performance of N-gram
filtering index vs a pure regex matcher
+ */
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Fork(1)
+@Warmup(iterations = 1)
+@Measurement(iterations = 2)
+@State(Scope.Benchmark)
+public class BenchmarkNgramFilteringIndex {
+ public static final String PREFIX = "somelonglongveryverylongword";
+ RealtimeNgramFilteringIndex _realtimeNgramFilteringIndex;
+
+ public static void main(String[] args)
+ throws Exception {
+ ChainedOptionsBuilder opt = new
OptionsBuilder().include(BenchmarkNgramFilteringIndex.class.getSimpleName());
+ new Runner(opt.build()).run();
+ }
+
+ @Setup(Level.Trial)
+ public void setUp()
+ throws Exception {
+ _realtimeNgramFilteringIndex = new RealtimeNgramFilteringIndex("col", 2,
3);
+ // Load the index with 10000 words
+ for (int i = 0; i < 10000; i++) {
+ _realtimeNgramFilteringIndex.add(PREFIX + i);
+ }
+ }
+
+ @Benchmark
+ @BenchmarkMode(Mode.All)
+ @OutputTimeUnit(TimeUnit.MILLISECONDS)
+ public int benchmarkTextMatchingUsingNgram() {
+ // First retrieve the docIds using the N-gram index
+ MutableRoaringBitmap map = _realtimeNgramFilteringIndex.getDocIds("ord78");
+ IntIterator intIterator = map.getIntIterator();
+ // Next doing regex matching on the docIds to validate the results
+ while (intIterator.hasNext()) {
+ int docId = intIterator.next();
+ // Simulate regex validate on the actual document.
+ (PREFIX + docId).matches(".*ord78.*");
+ }
+ return 0;
+ }
+
+ @Benchmark
+ @BenchmarkMode(Mode.All)
+ @OutputTimeUnit(TimeUnit.MILLISECONDS)
+ public int benchmarkMatchingViaRegexScan() {
+ // Matching the documents using regex scan
+ for (int i = 0; i < 10000; i++) {
+ (PREFIX + i).matches(".*ord78.*");
+ }
+ return 0;
+ }
+
+ @TearDown
+ public void tearDown()
+ throws Exception {
+ _realtimeNgramFilteringIndex.close();
+ }
+}
diff --git
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
new file mode 100644
index 00000000000..7f841ae7f8e
--- /dev/null
+++
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndex.java
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.impl.invertedindex;
+
+import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.pinot.segment.spi.index.mutable.MutableTextIndex;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.roaringbitmap.buffer.MutableRoaringBitmap;
+
+
+/**
+ * RealtimeNgramFilteringIndex is a mutable inverted index that supports
adding document IDs for n-grams. An n-gram
+ * is a contiguous sequence of n characters from input text. The length of the
n-gram is variable between
+ * [minNgramLength, maxNgramLength] as specified in the index config.
Internally, it uses a realtime inverted index
+ * which maps n-grams to their posting lists (document IDs). The index is
thread-safe for single writer and multiple
+ * readers.
+ */
+public class RealtimeNgramFilteringIndex implements MutableTextIndex {
+ private final String _column;
+ // Under the hood, ngram index is implemented as an inverted index from
n-grams to their posting lists.
+ private final RealtimeInvertedIndex _invertedIndex;
+ // A mapping from n-grams to their dictionary IDs.
+ private final Object2IntOpenHashMap<String> _ngramToDictIdMapping;
+ // Read and write locks for thread safety.
+ private final ReentrantReadWriteLock.ReadLock _readLock;
+ private final ReentrantReadWriteLock.WriteLock _writeLock;
+
+ // Next document ID to be assigned.
+ private int _nextDocId = 0;
+ private int _nextDictId = 0;
+
+ private final int _minNgramLength;
+ private final int _maxNgramLength;
+
+ public RealtimeNgramFilteringIndex(String column, int minNgramLength, int
maxNgramLength) {
+ _column = column;
+ _invertedIndex = new RealtimeInvertedIndex();
+ _ngramToDictIdMapping = new Object2IntOpenHashMap<>();
+ _ngramToDictIdMapping.defaultReturnValue(-1);
+ if (minNgramLength <= 0 || maxNgramLength <= 0 || minNgramLength >
maxNgramLength) {
+ throw new IllegalArgumentException("Invalid n-gram settings: " +
minNgramLength + "..." + maxNgramLength);
+ }
+ _minNgramLength = minNgramLength;
+ _maxNgramLength = maxNgramLength;
+
+ ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock();
+ _readLock = readWriteLock.readLock();
+ _writeLock = readWriteLock.writeLock();
+ }
+
+ /**
+ * Index the string value
+ *
+ * @param value the input value as a string
+ */
+ @Override
+ public void add(String value) {
+ if (value == null) {
+ return;
+ }
+ addHelper(value);
+ _nextDocId++;
+ }
+
+ /**
+ * Index an array of string values. Each string in the array is treated as a
separate document.
+ *
+ * @param values the documents as an array of strings
+ */
+ @Override
+ public void add(String[] values) {
+ if (values == null || values.length == 0) {
+ return;
+ }
+ for (String value : values) {
+ if (value != null) {
+ addHelper(value);
+ }
+ }
+ _nextDocId++;
+ }
+
+ /**
+ * Returns the matching dictionary ids for the given search query (optional).
+ *
+ * @param searchQuery as a literal string
+ */
+ @Override
+ public ImmutableRoaringBitmap getDictIds(String searchQuery) {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Returns the matching document ids for the given search query.
+ * Returns null if no n-grams are generated from the search query -- either
because the search query is null or empty
+ * or shorter than the minimum n-gram length.
+ * @param searchQuery as a literal string
+ */
+ @Override
+ public MutableRoaringBitmap getDocIds(String searchQuery) {
+ _readLock.lock();
+ Iterable<String> ngrams = generateNgrams(searchQuery);
+ if (!ngrams.iterator().hasNext()) {
+ return null; // No n-grams generated, return null.
+ }
+ ArrayList<MutableRoaringBitmap> bitmapLst = new ArrayList<>();
+ try {
+ for (String ngram : ngrams) {
+ int dictId = _ngramToDictIdMapping.getInt(ngram);
+ bitmapLst.add(_invertedIndex.getDocIds(dictId));
+ }
+ if (bitmapLst.isEmpty()) {
+ return null; // No n-grams found in the index.
+ }
+ MutableRoaringBitmap resultBitmap = bitmapLst.get(0);
+ for (int i = 1; i < bitmapLst.size(); i++) {
+ resultBitmap.and(bitmapLst.get(i));
+ }
+ return resultBitmap;
+ } finally {
+ _readLock.unlock();
+ }
+ }
+
+ /**
+ * Closes this stream and releases any system resources associated with it.
If the stream is already closed then
+ * invoking this method has no effect.
+ *
+ * <p> As noted in {@link AutoCloseable#close()}, cases where the
+ * close may fail require careful attention. It is strongly advised to
relinquish the underlying resources and to
+ * internally
+ * <em>mark</em> the {@code Closeable} as closed, prior to throwing
+ * the {@code IOException}.
+ *
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public void close()
+ throws IOException {
+ _ngramToDictIdMapping.clear();
+ }
+
+ private void addHelper(String value) {
+ Iterable<String> ngrams = generateNgrams(value);
+ _writeLock.lock();
+ try {
+ for (String ngram : ngrams) {
+ int currentDictId = _ngramToDictIdMapping.computeIfAbsent(ngram, k ->
_nextDictId++);
+ _invertedIndex.add(currentDictId, _nextDocId);
+ }
+ } finally {
+ _writeLock.unlock();
+ }
+ }
+
+ // Use Lucene's NGramTokenizer to generate n-grams from the input value.
+ private Iterable<String> generateNgrams(String value) {
+ if (value == null || value.isEmpty()) {
+ return java.util.Collections.emptyList();
+ }
+ ArrayList<String> ngrams = new ArrayList<>();
+ // Implement the logic to generate n-grams from the input value.
+ try (NGramTokenizer nGramTokenizer = new NGramTokenizer(_minNgramLength,
_maxNgramLength)) {
+ nGramTokenizer.setReader(new java.io.StringReader(value));
+ nGramTokenizer.reset();
+ while (nGramTokenizer.incrementToken()) {
+ // Extract the text term as a string and add it to the list.
+
ngrams.add(nGramTokenizer.getAttribute(CharTermAttribute.class).toString());
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return ngrams;
+ }
+}
diff --git
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
new file mode 100644
index 00000000000..2cf25954733
--- /dev/null
+++
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/RealtimeNgramFilteringIndexTest.java
@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.realtime.impl.invertedindex;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
+import org.testng.annotations.AfterClass;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertNull;
+import static org.testng.Assert.assertTrue;
+
+
+public class RealtimeNgramFilteringIndexTest {
+ // ngram length from 2 to 3
+ private RealtimeNgramFilteringIndex _ngram2To3Index;
+ // ngram length from 2 to 2
+ private RealtimeNgramFilteringIndex _ngram2To2Index;
+
+ @BeforeClass
+ public void setUp()
+ throws Exception {
+ _ngram2To3Index = new RealtimeNgramFilteringIndex("col", 2, 3);
+ _ngram2To2Index = new RealtimeNgramFilteringIndex("col", 2, 2);
+ List<String> documents = getTextData();
+
+ for (String doc : documents) {
+ _ngram2To3Index.add(doc);
+ _ngram2To2Index.add(doc);
+ }
+ }
+
+ @AfterClass
+ public void tearDown()
+ throws IOException {
+ _ngram2To3Index.close();
+ _ngram2To2Index.close();
+ }
+
+ @Test
+ public void testQueries() {
+ String ngramQuery = "drew";
+ testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1));
+ testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4));
+
+ ngramQuery = "rew";
+ testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1, 4));
+ testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4,
5));
+
+ ngramQuery = "re";
+ testSelectionResults(_ngram2To3Index, ngramQuery, Arrays.asList(0, 1, 4,
5));
+ testSelectionResults(_ngram2To2Index, ngramQuery, Arrays.asList(0, 1, 4,
5));
+
+ ngramQuery = "r";
+ testSelectionResults(_ngram2To3Index, ngramQuery, null);
+ testSelectionResults(_ngram2To2Index, ngramQuery, null);
+ }
+
+ private List<String> getTextData() {
+ return Arrays.asList("andrew", "drerew", "draw", "dr", "dr and rew", "ew
re");
+ }
+
+ private void testSelectionResults(RealtimeNgramFilteringIndex
realtimeNgramFilteringIndex, String nativeQuery,
+ List<Integer> results) {
+ ImmutableRoaringBitmap resultMap =
realtimeNgramFilteringIndex.getDocIds(nativeQuery);
+ if (resultMap == null) {
+ assertNull(results);
+ return;
+ }
+
+ assertEquals(resultMap.getCardinality(), results.size());
+ for (int result : results) {
+ assertTrue(resultMap.contains(result));
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]