(pinot) branch master updated: Adding Match prefix phrase query lucene parser (#16476)

jackie Thu, 31 Jul 2025 23:18:08 -0700

This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git



The following commit(s) were added to refs/heads/master by this push:
     new 1d534b41e3 Adding Match prefix phrase query lucene parser  (#16476)
1d534b41e3 is described below

commit 1d534b41e3a4b26d0b4e39f154163074ef72822d
Author: RAGHVENDRA KUMAR YADAV <[email protected]>
AuthorDate: Thu Jul 31 23:17:56 2025 -0700

    Adding Match prefix phrase query lucene parser  (#16476)
---
 .../pinot/queries/TextSearchQueriesTest.java       |  54 +++-
 .../lucene/parsers/PrefixPhraseQueryParser.java    | 298 +++++++++++++++++++++
 .../segment/local/utils/LuceneTextIndexUtils.java  |  23 +-
 .../local/utils/LuceneTextIndexUtilsTest.java      | 133 +++++++++
 4 files changed, 506 insertions(+), 2 deletions(-)

diff --git 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index 0852bba44c..b88276963a 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -2028,7 +2028,11 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     });
 
     String query = "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " 
WHERE TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
-        + ", '*ealtime streaming system*', 
'parser=CLASSIC,allowLeadingWildcard=true,defaultOperator=AND') LIMIT 50000";
+        + ", 'realtime streaming system', 'parser=MATCHPHRASE') LIMIT 50000";
+    testTextSearchSelectQueryHelper(query, 0, false, expected);
+
+    query = "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+        + ", 'realtime streaming system', 
'parser=MATCHPHRASE,enablePrefixMatch=true') LIMIT 50000";
     testTextSearchSelectQueryHelper(query, expected.size(), false, expected);
 
     List<Object[]> expected1 = new ArrayList<>();
@@ -2082,6 +2086,54 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     testTextSearchSelectQueryHelper(query8, expected.size(), false, expected);
   }
 
+  @Test
+  public void testMatchPhraseQueryParser()
+      throws Exception {
+    // Test case 1: "Tensor flow" - should match 3 documents
+    List<Object[]> expectedTensorFlow = new ArrayList<>();
+    expectedTensorFlow.add(new Object[]{
+        1004, "Machine learning, Tensor flow, Java, Stanford university,"
+    });
+    expectedTensorFlow.add(new Object[]{
+        1007, "C++, Python, Tensor flow, database kernel, storage, indexing 
and transaction processing, building "
+        + "large scale systems, Machine learning"
+    });
+    expectedTensorFlow.add(new Object[]{
+        1016, "CUDA, GPU processing, Tensor flow, Pandas, Python, Jupyter 
notebook, spark, Machine learning, building"
+        + " high performance scalable systems"
+    });
+
+    // Test phrase "Tensor flow" with prefix match on the last term and default slop/inOrder (slop=0, inOrder=true)
+    String queryExactPhrase =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'Tensor flow', 'parser=MATCHPHRASE,enablePrefixMatch=true') 
LIMIT 50000";
+    testTextSearchSelectQueryHelper(queryExactPhrase, 3, false, 
expectedTensorFlow);
+
+    // Test "Tensor database" with slop=1 (should allow one position gap)
+    List<Object[]> expectedTensorDatabase = new ArrayList<>();
+    expectedTensorDatabase.add(new Object[]{
+        1007, "C++, Python, Tensor flow, database kernel, storage, indexing 
and transaction processing, building "
+        + "large scale systems, Machine learning"
+    });
+
+    String querySlop1 =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'Tensor database', 
'parser=MATCHPHRASE,enablePrefixMatch=true,slop=1') LIMIT 50000";
+    testTextSearchSelectQueryHelper(querySlop1, 1, false, 
expectedTensorDatabase);
+
+    // Test "Tensor flow" with inOrder=false (should allow any order)
+    String queryInOrderFalse =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'Tensor flow', 
'parser=MATCHPHRASE,enablePrefixMatch=true,inOrder=false') LIMIT 50000";
+    testTextSearchSelectQueryHelper(queryInOrderFalse, 3, false, 
expectedTensorFlow);
+
+    // Test reversed term order "flow Tensor" with inOrder=false (slop left at its default of 0)
+    String querySlopAndInOrder =
+        "SELECT INT_COL, SKILLS_TEXT_COL FROM " + TABLE_NAME + " WHERE 
TEXT_MATCH(" + SKILLS_TEXT_COL_NAME
+            + ", 'flow Tensor', 
'parser=MATCHPHRASE,enablePrefixMatch=true,inOrder=false') LIMIT 50000";
+    testTextSearchSelectQueryHelper(querySlopAndInOrder, 3, false, 
expectedTensorFlow);
+  }
+
   // ===== TEST CASES FOR AND/OR FILTER OPERATORS =====
   @Test
   public void testTextSearchWithOptionsAndOrOperators()
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/PrefixPhraseQueryParser.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/PrefixPhraseQueryParser.java
new file mode 100644
index 0000000000..d6c5b74043
--- /dev/null
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/lucene/parsers/PrefixPhraseQueryParser.java
@@ -0,0 +1,298 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.local.segment.index.text.lucene.parsers;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
+import org.apache.lucene.queries.spans.SpanNearQuery;
+import org.apache.lucene.queries.spans.SpanQuery;
+import org.apache.lucene.queries.spans.SpanTermQuery;
+import org.apache.lucene.queryparser.charstream.CharStream;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParserBase;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.WildcardQuery;
+
+
+/**
+ * A custom query parser that creates prefix phrase queries.
+ * This parser tokenizes the input query and creates a SpanNearQuery where
+ * all terms except the last one are exact matches, and the last term can 
optionally
+ * have a wildcard suffix based on the enablePrefixMatch setting.
+ *
+ * <p>This parser is designed to support both exact phrase matching and prefix 
phrase matching:</p>
+ * <ul>
+ *   <li><strong>Exact phrase matching (default):</strong> All terms are 
matched exactly as they appear</li>
+ *   <li><strong>Prefix phrase matching:</strong> The last term is treated as 
a prefix with wildcard</li>
+ * </ul>
+ *
+ * <p><strong>Example usage:</strong></p>
+ * <ul>
+ *   <li>Input: 'java realtime streaming' with enablePrefixMatch=false 
(default)
+ *       <br>Output: SpanNearQuery with exact matches for "java", "realtime", 
and "streaming"</li>
+ *   <li>Input: 'java realtime streaming' with enablePrefixMatch=true
+ *       <br>Output: SpanNearQuery with exact matches for "java" and 
"realtime",
+ *       and wildcard match for "streaming*"</li>
+ *   <li>Input: 'stream' with enablePrefixMatch=false (default)
+ *       <br>Output: SpanTermQuery for exact match "stream"</li>
+ *   <li>Input: 'stream' with enablePrefixMatch=true
+ *       <br>Output: SpanMultiTermQueryWrapper for wildcard match 
"stream*"</li>
+ * </ul>
+ *
+ * <p><strong>Behavior:</strong></p>
+ * <ul>
+ *   <li>Single term queries: Returns SpanTermQuery (exact) or 
SpanMultiTermQueryWrapper (prefix)</li>
+ *   <li>Multiple term queries: Returns SpanNearQuery using the configured slop and inOrder (default: 0, true)</li>
+ *   <li>Null/empty queries: Throws ParseException</li>
+ *   <li>Whitespace-only queries: Throws ParseException</li>
+ * </ul>
+ *
+ * <p>This parser extends Lucene's QueryParserBase and implements the required 
abstract methods.
+ * It uses the provided Analyzer for tokenization and creates appropriate 
Lucene Span queries.</p>
+ */
+public class PrefixPhraseQueryParser extends QueryParserBase {
+  /** The field name to search in */
+  private final String _field;
+
+  /** The analyzer used for tokenizing the query */
+  private final Analyzer _analyzer;
+
+  /** Flag to control whether prefix matching is enabled on the last term */
+  private boolean _enablePrefixMatch = false;
+
+  /** The slop (distance) allowed between terms in the phrase query. Default is 0 (terms adjacent) */
+  private int _slop = 0;
+
+  /** Whether terms must appear in the specified order. Default is true (exact 
order) */
+  private boolean _inOrder = true;
+
+  /**
+   * Constructs a new PrefixPhraseQueryParser with the specified field and analyzer; both are
+   * stored without validation, so a null analyzer is only detected later, when parse(String) tokenizes the query.
+   *
+   * @param field the field name to search in (must not be null)
+   * @param analyzer the analyzer to use for tokenizing queries (must not be null)
+   */
+  public PrefixPhraseQueryParser(String field, Analyzer analyzer) {
+    super();
+    _field = field;
+    _analyzer = analyzer;
+  }
+
+  /**
+   * Sets whether to enable prefix matching on the last term.
+   *
+   * <p>When enabled ({@code true}):
+   * <ul>
+   *   <li>Single term queries: Returns a SpanMultiTermQueryWrapper with 
wildcard (*)</li>
+   *   <li>Multiple term queries: The last term gets a wildcard suffix (*)</li>
+   * </ul>
+   *
+   * <p>When disabled ({@code false}, default):
+   * <ul>
+   *   <li>Single term queries: Returns a SpanTermQuery for exact match</li>
+   *   <li>Multiple term queries: All terms are matched exactly</li>
+   * </ul>
+   *
+   * @param enablePrefixMatch true to enable prefix matching, false to disable 
(default)
+   */
+  public void setEnablePrefixMatch(boolean enablePrefixMatch) {
+    _enablePrefixMatch = enablePrefixMatch;
+  }
+
+  /**
+   * Sets the slop (distance) allowed between terms in the phrase query.
+   *
+   * <p>The slop determines how many positions apart the terms can be while 
still matching.
+   * For example:</p>
+   * <ul>
+   *   <li>slop=0: Terms must be adjacent in exact order</li>
+   *   <li>slop=1: Terms can be 1 position apart</li>
+   *   <li>slop=2: Terms can be 2 positions apart</li>
+   * </ul>
+   *
+   * <p>This setting only affects multiple term queries that create 
SpanNearQuery.</p>
+   *
+   * @param slop the number of positions allowed between terms (default is 0)
+   * @throws IllegalArgumentException if slop is negative
+   */
+  public void setSlop(int slop) {
+    if (slop < 0) {
+      throw new IllegalArgumentException("Slop cannot be negative: " + slop);
+    }
+    _slop = slop;
+  }
+
+  /**
+   * Sets whether terms must appear in the specified order.
+   *
+   * <p>When enabled ({@code true}, default):
+   * <ul>
+   *   <li>Terms must appear in the exact order specified in the query</li>
+   *   <li>Example: "java realtime" matches "java realtime streaming" but not 
"realtime java streaming"</li>
+   * </ul>
+   *
+   * <p>When disabled ({@code false}):
+   * <ul>
+   *   <li>Terms can appear in any order within the slop distance</li>
+   *   <li>Example: "java realtime" matches both "java realtime streaming" and 
"realtime java streaming"</li>
+   * </ul>
+   *
+   * <p>This setting only affects multiple term queries that create 
SpanNearQuery.</p>
+   *
+   * @param inOrder true to require terms in exact order, false to allow any 
order
+   */
+  public void setInOrder(boolean inOrder) {
+    _inOrder = inOrder;
+  }
+
+  /**
+   * Parses the given query string and returns an appropriate Lucene Query.
+   *
+   * <p>This method performs the following steps:</p>
+   * <ol>
+   *   <li>Validates the input query (null, empty, whitespace-only)</li>
+   *   <li>Tokenizes the query using the configured analyzer</li>
+   *   <li>Creates appropriate Lucene queries based on the number of tokens 
and enablePrefixMatch setting</li>
+   * </ol>
+   *
+   * <p><strong>Query Types Returned:</strong></p>
+   * <ul>
+   *   <li><strong>Single term:</strong>
+   *       <ul>
+   *         <li>If enablePrefixMatch=false: SpanTermQuery for exact match</li>
+   *         <li>If enablePrefixMatch=true: SpanMultiTermQueryWrapper with 
wildcard</li>
+   *       </ul>
+   *   </li>
+   *   <li><strong>Multiple terms:</strong> SpanNearQuery built with the configured slop and inOrder
+   *       <ul>
+   *         <li>All terms except the last: SpanTermQuery (exact match)</li>
+   *         <li>Last term: SpanTermQuery (exact) or SpanMultiTermQueryWrapper 
(wildcard)
+   *             based on enablePrefixMatch</li>
+   *       </ul>
+   *   </li>
+   * </ul>
+   *
+   * @param query the query string to parse (must not be null or empty)
+   * @return a Lucene Query object representing the parsed query
+   * @throws ParseException if the query is null, empty, or contains no valid 
tokens after tokenization
+   * @throws RuntimeException if tokenization fails due to an IOException
+   */
+  @Override
+  public Query parse(String query) throws ParseException {
+    if (query == null) {
+      throw new ParseException("Query cannot be null");
+    }
+
+    if (query.trim().isEmpty()) {
+      throw new ParseException("Query cannot be empty");
+    }
+
+    // Tokenize the query
+    List<String> tokens = new ArrayList<>();
+    try (TokenStream stream = _analyzer.tokenStream(_field, query)) {
+      stream.reset();
+      CharTermAttribute charTermAttribute = 
stream.addAttribute(CharTermAttribute.class);
+
+      while (stream.incrementToken()) {
+        String token = charTermAttribute.toString();
+        if (!token.trim().isEmpty()) {
+          tokens.add(token);
+        }
+      }
+      stream.end();
+    } catch (IOException e) {
+      throw new RuntimeException("Failed to tokenize query: " + query, e);
+    }
+
+    // Check if we have any valid tokens after tokenization
+    if (tokens.isEmpty()) {
+      throw new ParseException("Query tokenization resulted in no valid 
tokens");
+    }
+
+    // Handle single token case
+    if (tokens.size() == 1) {
+      String token = tokens.get(0);
+      if (_enablePrefixMatch) {
+        WildcardQuery wildcardQuery = new WildcardQuery(new Term(_field, token 
+ "*"));
+        return new SpanMultiTermQueryWrapper<>(wildcardQuery);
+      } else {
+        return new SpanTermQuery(new Term(_field, token));
+      }
+    }
+
+    // Handle multiple tokens case
+    List<SpanQuery> spanQueries = new ArrayList<>();
+
+    // Add regular SpanTermQueries for all tokens except the last one
+    for (int i = 0; i < tokens.size() - 1; i++) {
+      spanQueries.add(new SpanTermQuery(new Term(_field, tokens.get(i))));
+    }
+
+    // Add query for the last token
+    String lastToken = tokens.get(tokens.size() - 1);
+    if (_enablePrefixMatch) {
+      WildcardQuery wildcardQuery = new WildcardQuery(new Term(_field, 
lastToken + "*"));
+      spanQueries.add(new SpanMultiTermQueryWrapper<>(wildcardQuery));
+    } else {
+      spanQueries.add(new SpanTermQuery(new Term(_field, lastToken)));
+    }
+
+    // Create SpanNearQuery with configurable slop and inOrder settings
+    return new SpanNearQuery(spanQueries.toArray(new SpanQuery[0]), _slop, 
_inOrder);
+  }
+
+  /**
+   * Reinitializes the parser with a new CharStream.
+   *
+   * <p>This method is required by QueryParserBase but is not used in this 
implementation
+   * since we override the parse(String) method directly. The method is left 
as a no-op.</p>
+   *
+   * @param input the CharStream to reinitialize with (ignored in this 
implementation)
+   */
+  @Override
+  public void ReInit(CharStream input) {
+    // This method is required by QueryParserBase but not used in our 
implementation
+    // since we override parse(String) directly
+  }
+
+  /**
+   * Creates a top-level query for the specified field.
+   *
+   * <p>This method is required by QueryParserBase but is not supported in 
this implementation.
+   * Use the parse(String) method instead for query parsing.</p>
+   *
+   * @param field the field name (ignored in this implementation)
+   * @return never returns (always throws UnsupportedOperationException)
+   * @throws ParseException never thrown (method always throws 
UnsupportedOperationException)
+   * @throws UnsupportedOperationException always thrown, indicating this 
method is not supported
+   */
+  @Override
+  public Query TopLevelQuery(String field)
+      throws ParseException {
+    throw new UnsupportedOperationException(
+        "TopLevelQuery is not supported in PrefixPhraseQueryParser. Use 
parse(String) method instead.");
+  }
+}
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
index dfa4c9f6fb..d366789d9c 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtils.java
@@ -48,6 +48,7 @@ public class LuceneTextIndexUtils {
   public static final String PARSER_CLASSIC = "CLASSIC";
   public static final String PARSER_STANDARD = "STANDARD";
   public static final String PARSER_COMPLEX = "COMPLEX";
+  public static final String PARSER_MATCHPHRASE = "MATCHPHRASE";
 
   // Default operator constants
   public static final String DEFAULT_OPERATOR_AND = "AND";
@@ -76,6 +77,9 @@ public class LuceneTextIndexUtils {
     public static final String TIME_ZONE = "timeZone";
     public static final String PHRASE_SLOP = "phraseSlop";
     public static final String MAX_DETERMINIZED_STATES = 
"maxDeterminizedStates";
+    public static final String SLOP = "slop";
+    public static final String IN_ORDER = "inOrder";
+    public static final String ENABLE_PREFIX_MATCH = "enablePrefixMatch";
   }
 
   // Parser class names
@@ -84,6 +88,8 @@ public class LuceneTextIndexUtils {
   public static final String COMPLEX_PHRASE_QUERY_PARSER_CLASS =
       "org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser";
   public static final String CLASSIC_QUERY_PARSER = 
"org.apache.lucene.queryparser.classic.QueryParser";
+  public static final String MATCHPHRASE_QUERY_PARSER_CLASS =
+      
"org.apache.pinot.segment.local.segment.index.text.lucene.parsers.PrefixPhraseQueryParser";
 
   private LuceneTextIndexUtils() {
   }
@@ -147,6 +153,9 @@ public class LuceneTextIndexUtils {
       case PARSER_COMPLEX:
         parserClassName = COMPLEX_PHRASE_QUERY_PARSER_CLASS;
         break;
+      case PARSER_MATCHPHRASE:
+        parserClassName = MATCHPHRASE_QUERY_PARSER_CLASS;
+        break;
       default:
         parserClassName = CLASSIC_QUERY_PARSER;
         break;
@@ -224,7 +233,7 @@ public class LuceneTextIndexUtils {
         Method parseMethod = parser.getClass().getMethod("parse", 
String.class, String.class);
         query = (Query) parseMethod.invoke(parser, actualQuery, column);
       } else {
-        // Other parsers use parse(String)
+        // Other parsers (CLASSIC, COMPLEX, MATCHPHRASE) use parse(String)
         Method parseMethod = parser.getClass().getMethod("parse", 
String.class);
         query = (Query) parseMethod.invoke(parser, actualQuery);
       }
@@ -332,6 +341,18 @@ public class LuceneTextIndexUtils {
     public int getMaxDeterminizedStates() {
       return 
Integer.parseInt(_options.getOrDefault(OptionKey.MAX_DETERMINIZED_STATES, 
"10000"));
     }
+
+    public int getSlop() {
+      return Integer.parseInt(_options.getOrDefault(OptionKey.SLOP, "0"));
+    }
+
+    public boolean isInOrder() {
+      return Boolean.parseBoolean(_options.getOrDefault(OptionKey.IN_ORDER, 
"true"));
+    }
+
+    public boolean isEnablePrefixMatch() {
+      return 
Boolean.parseBoolean(_options.getOrDefault(OptionKey.ENABLE_PREFIX_MATCH, 
"false"));
+    }
   }
 
   /**
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java
index 2308809584..b8a70caad6 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/utils/LuceneTextIndexUtilsTest.java
@@ -18,7 +18,10 @@
  */
 package org.apache.pinot.segment.local.utils;
 
+import java.lang.reflect.InvocationTargetException;
 import java.util.Map;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
 import org.apache.lucene.queries.spans.SpanNearQuery;
@@ -27,9 +30,11 @@ import org.apache.lucene.queries.spans.SpanTermQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.RegexpQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.WildcardQuery;
+import 
org.apache.pinot.segment.local.segment.index.text.lucene.parsers.PrefixPhraseQueryParser;
 import org.testng.Assert;
 import org.testng.annotations.Test;
 
@@ -202,4 +207,132 @@ public class LuceneTextIndexUtilsTest {
     Assert.assertEquals(options.getPhraseSlop(), 2);
     Assert.assertEquals(options.getMaxDeterminizedStates(), 5000);
   }
+
+  @Test
+  public void testMatchPhraseQueryParser()
+      throws Exception {
+    // Test the new MATCHPHRASE parser functionality
+    String optionsString = "parser=MATCHPHRASE,enablePrefixMatch=true";
+    LuceneTextIndexUtils.LuceneTextIndexOptions options =
+        new LuceneTextIndexUtils.LuceneTextIndexOptions(optionsString);
+
+    // Create a simple analyzer for testing
+    Analyzer analyzer = new WhitespaceAnalyzer();
+    String column = "testColumn";
+
+    // Test positive case: "java realtime streaming"
+    String query = "java realtime streaming";
+
+    Query result = LuceneTextIndexUtils.createQueryParserWithOptions(query, 
options, column, analyzer);
+    Assert.assertNotNull(result);
+    Assert.assertTrue(result instanceof SpanNearQuery);
+
+    // Test positive case: "realtime stream*"
+    query = "realtime stream*";
+    result = LuceneTextIndexUtils.createQueryParserWithOptions(query, options, 
column, analyzer);
+    Assert.assertNotNull(result);
+    Assert.assertTrue(result instanceof SpanNearQuery);
+
+    // Test positive case: "stream*" - single term should return 
SpanMultiTermQueryWrapper
+    query = "stream*";
+    result = LuceneTextIndexUtils.createQueryParserWithOptions(query, options, 
column, analyzer);
+    Assert.assertNotNull(result);
+    Assert.assertTrue(result instanceof SpanMultiTermQueryWrapper);
+
+    // Test edge case: empty string ""
+    query = "";
+    try {
+      LuceneTextIndexUtils.createQueryParserWithOptions(query, options, 
column, analyzer);
+      Assert.fail("Expected exception for empty query");
+    } catch (RuntimeException e) {
+      // The method wraps ParseException in RuntimeException via reflection
+      Assert.assertTrue(e.getCause() instanceof InvocationTargetException);
+    }
+
+    // Test edge case: null query
+    try {
+      LuceneTextIndexUtils.createQueryParserWithOptions(null, options, column, 
analyzer);
+      Assert.fail("Expected exception for null query");
+    } catch (RuntimeException e) {
+      // The method wraps ParseException in RuntimeException via reflection
+      Assert.assertTrue(e.getCause() instanceof InvocationTargetException);
+    }
+
+    // Test that TopLevelQuery throws UnsupportedOperationException
+    try {
+      PrefixPhraseQueryParser parser = new PrefixPhraseQueryParser(column, 
analyzer);
+      parser.TopLevelQuery(column);
+      Assert.fail("Expected UnsupportedOperationException for TopLevelQuery");
+    } catch (UnsupportedOperationException e) {
+      Assert.assertTrue(e.getMessage().contains("TopLevelQuery is not 
supported"));
+    }
+
+    // Test slop and inOrder settings
+    PrefixPhraseQueryParser slopParser = new PrefixPhraseQueryParser(column, 
analyzer);
+
+    // Test default slop and inOrder (0 slop, true inOrder)
+    Query defaultSlopQuery = slopParser.parse("java realtime streaming");
+    Assert.assertTrue(defaultSlopQuery instanceof SpanNearQuery);
+
+    // Test custom slop and inOrder
+    slopParser.setSlop(2);
+    slopParser.setInOrder(false);
+    Query customSlopQuery = slopParser.parse("java realtime streaming");
+    Assert.assertTrue(customSlopQuery instanceof SpanNearQuery);
+
+    // Test invalid slop (should throw exception)
+    try {
+      slopParser.setSlop(-1);
+      Assert.fail("Expected IllegalArgumentException for negative slop");
+    } catch (IllegalArgumentException e) {
+      Assert.assertTrue(e.getMessage().contains("Slop cannot be negative"));
+    }
+
+    // Test slop and inOrder with createQueryParserWithOptions
+    LuceneTextIndexUtils.LuceneTextIndexOptions slopOptions =
+        
LuceneTextIndexUtils.createOptions("parser=MATCHPHRASE,enablePrefixMatch=true");
+
+    // Test default slop and inOrder behavior
+    Query defaultSlopResult = 
LuceneTextIndexUtils.createQueryParserWithOptions(
+        "java realtime streaming", slopOptions, column, analyzer);
+    Assert.assertTrue(defaultSlopResult instanceof SpanNearQuery);
+
+    // Options carry no slop/inOrder overrides; custom values are set directly on the parser below
+    LuceneTextIndexUtils.LuceneTextIndexOptions customSlopOptions =
+        
LuceneTextIndexUtils.createOptions("parser=MATCHPHRASE,enablePrefixMatch=true");
+
+    // Create a parser instance to test slop and inOrder settings
+    PrefixPhraseQueryParser customParser = new PrefixPhraseQueryParser(column, 
analyzer);
+    customParser.setEnablePrefixMatch(true);
+    customParser.setSlop(2);
+    customParser.setInOrder(false);
+
+    // Test that custom settings work correctly
+    Query customSlopResult = customParser.parse("java realtime streaming");
+    Assert.assertTrue(customSlopResult instanceof SpanNearQuery);
+
+    // Test that the parser can be configured with different slop values
+    customParser.setSlop(1);
+    Query slop1Result = customParser.parse("java realtime streaming");
+    Assert.assertTrue(slop1Result instanceof SpanNearQuery);
+
+    // Test that the parser can be configured with different inOrder values
+    customParser.setInOrder(true);
+    Query inOrderTrueResult = customParser.parse("java realtime streaming");
+    Assert.assertTrue(inOrderTrueResult instanceof SpanNearQuery);
+
+    // Test default behavior using createOptions
+    LuceneTextIndexUtils.LuceneTextIndexOptions defaultOptions =
+        LuceneTextIndexUtils.createOptions("parser=MATCHPHRASE");
+
+    // Test single term with default behavior (prefix match disabled)
+    Query defaultSingleTermQuery =
+        LuceneTextIndexUtils.createQueryParserWithOptions("stream", 
defaultOptions, column, analyzer);
+    Assert.assertTrue(defaultSingleTermQuery instanceof SpanTermQuery);
+
+    // Test multiple terms with default behavior (prefix match disabled)
+    Query defaultMultiTermQuery =
+        LuceneTextIndexUtils.createQueryParserWithOptions("java realtime 
streaming", defaultOptions, column, analyzer);
+    Assert.assertTrue(defaultMultiTermQuery instanceof SpanNearQuery);
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(pinot) branch master updated: Adding Match prefix phrase query lucene parser (#16476)

Reply via email to