(pinot) branch master updated: Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830)

xiangfu Tue, 20 May 2025 06:30:01 -0700

This is an automated email from the ASF dual-hosted git repository.

xiangfu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git



The following commit(s) were added to refs/heads/master by this push:
     new f1b2f461e5 Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830)
f1b2f461e5 is described below

commit f1b2f461e516c7846f554c28da2d349205e1ce88
Author: Xiang Fu <xiangfu.1...@gmail.com>
AuthorDate: Tue May 20 21:29:51 2025 +0800

    Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830)
---
 .../pinot/queries/TextSearchQueriesTest.java       | 25 ++++-----
 .../impl/invertedindex/NativeMutableTextIndex.java |  4 +-
 .../creator/impl/text/NativeTextIndexCreator.java  |  4 +-
 ...nalyzer.java => CaseAwareStandardAnalyzer.java} | 59 ++++++++++++++++++----
 .../local/segment/store/TextIndexUtils.java        | 30 ++++++-----
 5 files changed, 83 insertions(+), 39 deletions(-)

diff --git 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
index b0ad5f7e1c..4b4f8d8102 100644
--- 
a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
+++ 
b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java
@@ -33,7 +33,7 @@ import java.util.Objects;
 import java.util.Random;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
@@ -58,6 +58,7 @@ import 
org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoa
 import 
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndex;
 import 
org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
 import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig;
+import 
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
 import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader;
 import org.apache.pinot.segment.spi.ImmutableSegment;
 import org.apache.pinot.segment.spi.IndexSegment;
@@ -1372,15 +1373,15 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     // create and open an index writer
     File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test1.index");
     Directory indexDirectory = FSDirectory.open(indexFile.toPath());
-    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
-    IndexWriterConfig indexWriterConfig = new 
IndexWriterConfig(standardAnalyzer);
+    Analyzer analyzer = new CaseAwareStandardAnalyzer();
+    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
     indexWriterConfig.setRAMBufferSizeMB(500);
     IndexWriter indexWriter = new IndexWriter(indexDirectory, 
indexWriterConfig);
 
     // create an NRT index reader
     SearcherManager searcherManager = new SearcherManager(indexWriter, false, 
false, null);
 
-    QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
+    QueryParser queryParser = new QueryParser("skill", analyzer);
     Query query = queryParser.parse("\"machine learning\"");
 
     // acquire a searcher
@@ -1542,8 +1543,8 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     // create and open an index writer
     File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test2.index");
     Directory indexDirectory = FSDirectory.open(indexFile.toPath());
-    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
-    IndexWriterConfig indexWriterConfig = new 
IndexWriterConfig(standardAnalyzer);
+    CaseAwareStandardAnalyzer analyzer = new CaseAwareStandardAnalyzer();
+    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
     indexWriterConfig.setRAMBufferSizeMB(50);
     IndexWriter indexWriter = new IndexWriter(indexDirectory, 
indexWriterConfig);
 
@@ -1553,7 +1554,7 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     indexWriter.addDocument(docToIndex);
 
     // create an NRT index reader from the writer -- should see one 
uncommitted document
-    QueryParser queryParser = new QueryParser("skill", standardAnalyzer);
+    QueryParser queryParser = new QueryParser("skill", analyzer);
     Query query = queryParser.parse("\"distributed systems\" AND (Java C++)");
     IndexReader indexReader1 = DirectoryReader.open(indexWriter);
     IndexSearcher searcher1 = new IndexSearcher(indexReader1);
@@ -1592,9 +1593,9 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
       throws Exception {
     File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test3.index");
     Directory indexDirectory = FSDirectory.open(indexFile.toPath());
-    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
+    Analyzer analyzer = new CaseAwareStandardAnalyzer();
     // create and open a writer
-    IndexWriterConfig indexWriterConfig = new 
IndexWriterConfig(standardAnalyzer);
+    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
     indexWriterConfig.setRAMBufferSizeMB(500);
     IndexWriter indexWriter = new IndexWriter(indexDirectory, 
indexWriterConfig);
 
@@ -1608,7 +1609,7 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
 
     // start writer and reader
     Thread writer = new Thread(new RealtimeWriter(indexWriter));
-    Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, 
standardAnalyzer));
+    Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, 
analyzer));
 
     writer.start();
     realtimeReader.start();
@@ -1674,8 +1675,8 @@ public class TextSearchQueriesTest extends 
BaseQueriesTest {
     private final QueryParser _queryParser;
     private final SearcherManager _searcherManager;
 
-    RealtimeReader(SearcherManager searcherManager, StandardAnalyzer 
standardAnalyzer) {
-      _queryParser = new QueryParser("skill", standardAnalyzer);
+    RealtimeReader(SearcherManager searcherManager, Analyzer analyzer) {
+      _queryParser = new QueryParser("skill", analyzer);
       _searcherManager = searcherManager;
     }
 
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
index 1e56c57c87..abeeb08cbc 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java
@@ -25,9 +25,9 @@ import java.util.List;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import 
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
+import 
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
 import org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFST;
 import 
org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFSTImpl;
 import 
org.apache.pinot.segment.local.utils.nativefst.utils.RealTimeRegexpMatcher;
@@ -58,7 +58,7 @@ public class NativeMutableTextIndex implements 
MutableTextIndex {
     ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock();
     _readLock = readWriteLock.readLock();
     _writeLock = readWriteLock.writeLock();
-    _analyzer = new 
StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
+    _analyzer = new 
CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
   }
 
   @Override
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
index 832801883d..7ef4d25214 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java
@@ -31,10 +31,10 @@ import java.util.TreeMap;
 import org.apache.commons.io.FileUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import 
org.apache.pinot.segment.local.segment.creator.impl.inv.BitmapInvertedIndexWriter;
 import 
org.apache.pinot.segment.local.segment.index.text.AbstractTextIndexCreator;
+import 
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
 import org.apache.pinot.segment.local.utils.nativefst.FST;
 import org.apache.pinot.segment.local.utils.nativefst.FSTHeader;
 import org.apache.pinot.segment.local.utils.nativefst.builder.FSTBuilder;
@@ -87,7 +87,7 @@ public class NativeTextIndexCreator extends 
AbstractTextIndexCreator {
     }
     _fstIndexFile = new File(_tempDir, FST_FILE_NAME);
     _invertedIndexFile = new File(_tempDir, INVERTED_INDEX_FILE_NAME);
-    _analyzer = new 
StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
+    _analyzer = new 
CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET);
   }
 
   @Override
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
similarity index 60%
rename from 
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
rename to 
pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
index d8b003f7ae..25552b00ec 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java
@@ -19,6 +19,7 @@
 package org.apache.pinot.segment.local.segment.index.text;
 
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,36 +27,55 @@ import 
org.apache.lucene.analysis.standard.StandardTokenizer;
 
 
 /**
- * A {@link org.apache.lucene.analysis.Analyzer} for case-sensitive text.
+ * A {@link org.apache.lucene.analysis.Analyzer} for standard text that is 
case-aware.
+ * This analyzer supports both case-sensitive and case-insensitive modes, 
making it
+ * suitable for use cases where case sensitivity is configurable.
+ * <p>
  * It's directly copied from {@link 
org.apache.lucene.analysis.standard.StandardAnalyzer} but
- * removes the lowercase filter.
+ * allows case-sensitive tokenization.
+ * <p>
+ * The analyzer applies lowercasing to tokens only when the {@code 
caseSensitive} flag is set to
+ * {@code false} (the default behavior, same as {@link 
org.apache.lucene.analysis.standard.StandardAnalyzer}).
+ * When {@code caseSensitive} is {@code true}, tokens preserve their original 
case.
  */
-public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase {
+public class CaseAwareStandardAnalyzer extends StopwordAnalyzerBase {
 
   /** Default maximum allowed token length */
   public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
 
   private int _maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
 
+  private final boolean _caseSensitive;
+
   /**
    * Builds an analyzer with the given stop words.
    *
    * @param stopWords stop words
    */
-  public CaseSensitiveAnalyzer(CharArraySet stopWords) {
-    super(stopWords);
+  public CaseAwareStandardAnalyzer(CharArraySet stopWords) {
+    this(stopWords, false);
   }
 
   /** Builds an analyzer with no stop words. */
-  public CaseSensitiveAnalyzer() {
-    this(CharArraySet.EMPTY_SET);
+  public CaseAwareStandardAnalyzer() {
+    this(CharArraySet.EMPTY_SET, false);
+  }
+
+  /**
+   * Builds an analyzer with the given stop words; tokens are lowercased unless {@code caseSensitive} is true.
+   *
+   * @param stopWords stop words
+   */
+  public CaseAwareStandardAnalyzer(CharArraySet stopWords, boolean 
caseSensitive) {
+    super(stopWords);
+    _caseSensitive = caseSensitive;
   }
 
   /**
    * Set the max allowed token length. Tokens larger than this will be chopped 
up at this token
    * length and emitted as multiple tokens. If you need to skip such large 
tokens, you could
    * increase this max length, and then use {@code LengthFilter} to remove 
long tokens. The default
-   * is {@link 
org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
+   * is {@link CaseAwareStandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
    */
   public void setMaxTokenLength(int length) {
     _maxTokenLength = length;
@@ -70,11 +90,24 @@ public class CaseSensitiveAnalyzer extends 
StopwordAnalyzerBase {
     return _maxTokenLength;
   }
 
+  /**
+   * Returns true if the analyzer is case sensitive
+   */
+  public boolean isCaseSensitive() {
+    return _caseSensitive;
+  }
+
   @Override
   protected TokenStreamComponents createComponents(final String fieldName) {
     final StandardTokenizer tokenizer = new StandardTokenizer();
     tokenizer.setMaxTokenLength(_maxTokenLength);
-    TokenStream tok = new StopFilter(tokenizer, stopwords);
+    TokenStream tok;
+    if (_caseSensitive) {
+      tok = tokenizer;
+    } else {
+      tok = new LowerCaseFilter(tokenizer);
+    }
+    tok = new StopFilter(tok, stopwords);
     return new TokenStreamComponents(
         r -> {
           tokenizer.setMaxTokenLength(_maxTokenLength);
@@ -82,4 +115,12 @@ public class CaseSensitiveAnalyzer extends 
StopwordAnalyzerBase {
         },
         tok);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    if (_caseSensitive) {
+      return in;
+    }
+    return new LowerCaseFilter(in);
+  }
 }
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
index 8d982af4ee..d8d633213e 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java
@@ -36,7 +36,7 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.queryparser.classic.QueryParserBase;
 import 
org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator;
-import org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer;
+import 
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer;
 import 
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder;
 import org.apache.pinot.segment.spi.V1Constants;
 import org.apache.pinot.segment.spi.V1Constants.Indexes;
@@ -51,6 +51,7 @@ import org.slf4j.LoggerFactory;
 
 public class TextIndexUtils {
   private static final Logger LOGGER = 
LoggerFactory.getLogger(TextIndexUtils.class);
+
   private TextIndexUtils() {
   }
 
@@ -135,18 +136,20 @@ public class TextIndexUtils {
    * @return Lucene Analyzer class instance
    * @throws ReflectiveOperationException if instantiation via reflection fails
    */
-  public static Analyzer getAnalyzer(TextIndexConfig config) throws 
ReflectiveOperationException {
+  public static Analyzer getAnalyzer(TextIndexConfig config)
+      throws ReflectiveOperationException {
     String luceneAnalyzerClassName = config.getLuceneAnalyzerClass();
     List<String> luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs();
     List<String> luceneAnalyzerClassArgTypes = 
config.getLuceneAnalyzerClassArgTypes();
 
     if (null == luceneAnalyzerClassName || luceneAnalyzerClassName.isEmpty()
-            || 
(luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName())
-                    && luceneAnalyzerClassArgs.isEmpty() && 
luceneAnalyzerClassArgTypes.isEmpty())) {
+        || 
((luceneAnalyzerClassName.equals(CaseAwareStandardAnalyzer.class.getName())
+        || luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName()))
+        && luceneAnalyzerClassArgs.isEmpty() && 
luceneAnalyzerClassArgTypes.isEmpty())) {
       // When there is no analyzer defined, or when StandardAnalyzer (default) or CaseAwareStandardAnalyzer is used
       // without arguments, use existing logic to obtain a CaseAwareStandardAnalyzer with customized stop words
       return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords(
-              config.getStopWordsInclude(), config.getStopWordsExclude(), 
config.isCaseSensitive());
+          config.getStopWordsInclude(), config.getStopWordsExclude(), 
config.isCaseSensitive());
     }
 
     // Custom analyzer + custom configs via reflection
@@ -177,7 +180,7 @@ public class TextIndexUtils {
 
     // Return a new instance of custom lucene analyzer class
     return (Analyzer) 
luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class<?>[0]))
-            .newInstance(argValues.toArray(new Object[0]));
+        .newInstance(argValues.toArray(new Object[0]));
   }
 
   /**
@@ -186,7 +189,8 @@ public class TextIndexUtils {
    * @return Class object of the value type
    * @throws ClassNotFoundException when the value type is not supported
    */
-  public static Class<?> parseSupportedTypes(String valueTypeString) throws 
ClassNotFoundException {
+  public static Class<?> parseSupportedTypes(String valueTypeString)
+      throws ClassNotFoundException {
     try {
       // Support both primitive types + class
       switch (valueTypeString) {
@@ -223,7 +227,7 @@ public class TextIndexUtils {
    * @throws ReflectiveOperationException if value cannot be coerced without 
ambiguity or encountered unsupported type
    */
   public static Object parseSupportedTypeValues(String stringValue, Class<?> 
clazz)
-          throws ReflectiveOperationException {
+      throws ReflectiveOperationException {
     try {
       if (clazz.equals(String.class)) {
         return stringValue;
@@ -260,7 +264,7 @@ public class TextIndexUtils {
       }
     } catch (NumberFormatException | ReflectiveOperationException ex) {
       String exceptionMessage = "Custom analyzer argument cannot be coerced 
from "
-              + stringValue + " to " + clazz.getName() + " type";
+          + stringValue + " to " + clazz.getName() + " type";
       LOGGER.error(exceptionMessage);
       throw new ReflectiveOperationException(exceptionMessage);
     } catch (UnsupportedOperationException ex) {
@@ -280,14 +284,12 @@ public class TextIndexUtils {
     if (stopWordsExclude != null) {
       stopWordsExclude.forEach(stopWordSet::remove);
     }
-    if (isCaseSensitive) {
-      return new CaseSensitiveAnalyzer(new CharArraySet(stopWordSet, false));
-    }
-    return new StandardAnalyzer(new CharArraySet(stopWordSet, true));
+    return new CaseAwareStandardAnalyzer(new CharArraySet(stopWordSet, 
!isCaseSensitive), isCaseSensitive);
   }
 
   public static Constructor<QueryParserBase> 
getQueryParserWithStringAndAnalyzerTypeConstructor(
-          String queryParserClassName) throws ReflectiveOperationException {
+      String queryParserClassName)
+      throws ReflectiveOperationException {
     // Fail-fast if the specified query parser class is not a QueryParserBase subclass
     final Class<?> queryParserClass = Class.forName(queryParserClassName);
     if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

(pinot) branch master updated: Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830)

Reply via email to