This is an automated email from the ASF dual-hosted git repository. xiangfu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new f1b2f461e5 Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830) f1b2f461e5 is described below commit f1b2f461e516c7846f554c28da2d349205e1ce88 Author: Xiang Fu <xiangfu.1...@gmail.com> AuthorDate: Tue May 20 21:29:51 2025 +0800 Refactor CaseSensitiveAnalyzer and StandardAnalyzer to CaseAwareStandardAnalyzer (#15830) --- .../pinot/queries/TextSearchQueriesTest.java | 25 ++++----- .../impl/invertedindex/NativeMutableTextIndex.java | 4 +- .../creator/impl/text/NativeTextIndexCreator.java | 4 +- ...nalyzer.java => CaseAwareStandardAnalyzer.java} | 59 ++++++++++++++++++---- .../local/segment/store/TextIndexUtils.java | 30 ++++++----- 5 files changed, 83 insertions(+), 39 deletions(-) diff --git a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java index b0ad5f7e1c..4b4f8d8102 100644 --- a/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java +++ b/pinot-core/src/test/java/org/apache/pinot/queries/TextSearchQueriesTest.java @@ -33,7 +33,7 @@ import java.util.Objects; import java.util.Random; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; @@ -58,6 +58,7 @@ import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoa import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndex; import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl; import org.apache.pinot.segment.local.segment.index.loader.IndexLoadingConfig; +import 
org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer; import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader; import org.apache.pinot.segment.spi.ImmutableSegment; import org.apache.pinot.segment.spi.IndexSegment; @@ -1372,15 +1373,15 @@ public class TextSearchQueriesTest extends BaseQueriesTest { // create and open an index writer File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test1.index"); Directory indexDirectory = FSDirectory.open(indexFile.toPath()); - StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer); + Analyzer analyzer = new CaseAwareStandardAnalyzer(); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(500); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); // create an NRT index reader SearcherManager searcherManager = new SearcherManager(indexWriter, false, false, null); - QueryParser queryParser = new QueryParser("skill", standardAnalyzer); + QueryParser queryParser = new QueryParser("skill", analyzer); Query query = queryParser.parse("\"machine learning\""); // acquire a searcher @@ -1542,8 +1543,8 @@ public class TextSearchQueriesTest extends BaseQueriesTest { // create and open an index writer File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test2.index"); Directory indexDirectory = FSDirectory.open(indexFile.toPath()); - StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer); + CaseAwareStandardAnalyzer analyzer = new CaseAwareStandardAnalyzer(); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(50); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); @@ -1553,7 +1554,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest { 
indexWriter.addDocument(docToIndex); // create an NRT index reader from the writer -- should see one uncommitted document - QueryParser queryParser = new QueryParser("skill", standardAnalyzer); + QueryParser queryParser = new QueryParser("skill", analyzer); Query query = queryParser.parse("\"distributed systems\" AND (Java C++)"); IndexReader indexReader1 = DirectoryReader.open(indexWriter); IndexSearcher searcher1 = new IndexSearcher(indexReader1); @@ -1592,9 +1593,9 @@ public class TextSearchQueriesTest extends BaseQueriesTest { throws Exception { File indexFile = new File(INDEX_DIR.getPath() + "/realtime-test3.index"); Directory indexDirectory = FSDirectory.open(indexFile.toPath()); - StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); + Analyzer analyzer = new CaseAwareStandardAnalyzer(); // create and open a writer - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(standardAnalyzer); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setRAMBufferSizeMB(500); IndexWriter indexWriter = new IndexWriter(indexDirectory, indexWriterConfig); @@ -1608,7 +1609,7 @@ public class TextSearchQueriesTest extends BaseQueriesTest { // start writer and reader Thread writer = new Thread(new RealtimeWriter(indexWriter)); - Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, standardAnalyzer)); + Thread realtimeReader = new Thread(new RealtimeReader(searcherManager, analyzer)); writer.start(); realtimeReader.start(); @@ -1674,8 +1675,8 @@ public class TextSearchQueriesTest extends BaseQueriesTest { private final QueryParser _queryParser; private final SearcherManager _searcherManager; - RealtimeReader(SearcherManager searcherManager, StandardAnalyzer standardAnalyzer) { - _queryParser = new QueryParser("skill", standardAnalyzer); + RealtimeReader(SearcherManager searcherManager, Analyzer analyzer) { + _queryParser = new QueryParser("skill", analyzer); _searcherManager = searcherManager; } diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java index 1e56c57c87..abeeb08cbc 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/realtime/impl/invertedindex/NativeMutableTextIndex.java @@ -25,9 +25,9 @@ import java.util.List; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator; +import org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer; import org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFST; import org.apache.pinot.segment.local.utils.nativefst.mutablefst.MutableFSTImpl; import org.apache.pinot.segment.local.utils.nativefst.utils.RealTimeRegexpMatcher; @@ -58,7 +58,7 @@ public class NativeMutableTextIndex implements MutableTextIndex { ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock(); _readLock = readWriteLock.readLock(); _writeLock = readWriteLock.writeLock(); - _analyzer = new StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET); + _analyzer = new CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java index 832801883d..7ef4d25214 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/creator/impl/text/NativeTextIndexCreator.java @@ -31,10 +31,10 @@ import java.util.TreeMap; import org.apache.commons.io.FileUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.pinot.segment.local.segment.creator.impl.inv.BitmapInvertedIndexWriter; import org.apache.pinot.segment.local.segment.index.text.AbstractTextIndexCreator; +import org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer; import org.apache.pinot.segment.local.utils.nativefst.FST; import org.apache.pinot.segment.local.utils.nativefst.FSTHeader; import org.apache.pinot.segment.local.utils.nativefst.builder.FSTBuilder; @@ -87,7 +87,7 @@ public class NativeTextIndexCreator extends AbstractTextIndexCreator { } _fstIndexFile = new File(_tempDir, FST_FILE_NAME); _invertedIndexFile = new File(_tempDir, INVERTED_INDEX_FILE_NAME); - _analyzer = new StandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET); + _analyzer = new CaseAwareStandardAnalyzer(LuceneTextIndexCreator.ENGLISH_STOP_WORDS_SET); } @Override diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java similarity index 60% rename from pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java rename to pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java index d8b003f7ae..25552b00ec 100644 --- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseSensitiveAnalyzer.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/index/text/CaseAwareStandardAnalyzer.java @@ -19,6 +19,7 @@ package org.apache.pinot.segment.local.segment.index.text; import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.StopwordAnalyzerBase; import org.apache.lucene.analysis.TokenStream; @@ -26,36 +27,55 @@ import org.apache.lucene.analysis.standard.StandardTokenizer; /** - * A {@link org.apache.lucene.analysis.Analyzer} for case-sensitive text. + * A {@link org.apache.lucene.analysis.Analyzer} for standard text that is case-aware. + * This analyzer supports both case-sensitive and case-insensitive modes, making it + * suitable for use cases where case sensitivity is configurable. + * <p> * It's directly copied from {@link org.apache.lucene.analysis.standard.StandardAnalyzer} but - * removes the lowercase filter. + * allows case-sensitive tokenization. + * <p> + * The analyzer applies lowercasing to tokens only when the {@code caseSensitive} flag is set to + * {@code false} (the default behavior, same as {@link org.apache.lucene.analysis.standard.StandardAnalyzer}). + * When {@code caseSensitive} is {@code true}, tokens preserve their original case. */ -public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase { +public class CaseAwareStandardAnalyzer extends StopwordAnalyzerBase { /** Default maximum allowed token length */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private int _maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; + private final boolean _caseSensitive; + /** * Builds an analyzer with the given stop words. 
* * @param stopWords stop words */ - public CaseSensitiveAnalyzer(CharArraySet stopWords) { - super(stopWords); + public CaseAwareStandardAnalyzer(CharArraySet stopWords) { + this(stopWords, false); } /** Builds an analyzer with no stop words. */ - public CaseSensitiveAnalyzer() { - this(CharArraySet.EMPTY_SET); + public CaseAwareStandardAnalyzer() { + this(CharArraySet.EMPTY_SET, false); + } + + /** + * Builds an analyzer with the given stop words. + * + * @param stopWords stop words + */ + public CaseAwareStandardAnalyzer(CharArraySet stopWords, boolean caseSensitive) { + super(stopWords); + _caseSensitive = caseSensitive; } /** * Set the max allowed token length. Tokens larger than this will be chopped up at this token * length and emitted as multiple tokens. If you need to skip such large tokens, you could * increase this max length, and then use {@code LengthFilter} to remove long tokens. The default - * is {@link org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. + * is {@link CaseAwareStandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}. 
*/ public void setMaxTokenLength(int length) { _maxTokenLength = length; @@ -70,11 +90,24 @@ public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase { return _maxTokenLength; } + /** + * Returns true if the analyzer is case sensitive + */ + public boolean isCaseSensitive() { + return _caseSensitive; + } + @Override protected TokenStreamComponents createComponents(final String fieldName) { final StandardTokenizer tokenizer = new StandardTokenizer(); tokenizer.setMaxTokenLength(_maxTokenLength); - TokenStream tok = new StopFilter(tokenizer, stopwords); + TokenStream tok; + if (_caseSensitive) { + tok = tokenizer; + } else { + tok = new LowerCaseFilter(tokenizer); + } + tok = new StopFilter(tok, stopwords); return new TokenStreamComponents( r -> { tokenizer.setMaxTokenLength(_maxTokenLength); @@ -82,4 +115,12 @@ public class CaseSensitiveAnalyzer extends StopwordAnalyzerBase { }, tok); } + + @Override + protected TokenStream normalize(String fieldName, TokenStream in) { + if (_caseSensitive) { + return in; + } + return new LowerCaseFilter(in); + } } diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java index 8d982af4ee..d8d633213e 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/segment/store/TextIndexUtils.java @@ -36,7 +36,7 @@ import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryparser.classic.QueryParserBase; import org.apache.pinot.segment.local.segment.creator.impl.text.LuceneTextIndexCreator; -import org.apache.pinot.segment.local.segment.index.text.CaseSensitiveAnalyzer; +import org.apache.pinot.segment.local.segment.index.text.CaseAwareStandardAnalyzer; import 
org.apache.pinot.segment.local.segment.index.text.TextIndexConfigBuilder; import org.apache.pinot.segment.spi.V1Constants; import org.apache.pinot.segment.spi.V1Constants.Indexes; @@ -51,6 +51,7 @@ import org.slf4j.LoggerFactory; public class TextIndexUtils { private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexUtils.class); + private TextIndexUtils() { } @@ -135,18 +136,20 @@ public class TextIndexUtils { * @return Lucene Analyzer class instance * @throws ReflectiveOperationException if instantiation via reflection fails */ - public static Analyzer getAnalyzer(TextIndexConfig config) throws ReflectiveOperationException { + public static Analyzer getAnalyzer(TextIndexConfig config) + throws ReflectiveOperationException { String luceneAnalyzerClassName = config.getLuceneAnalyzerClass(); List<String> luceneAnalyzerClassArgs = config.getLuceneAnalyzerClassArgs(); List<String> luceneAnalyzerClassArgTypes = config.getLuceneAnalyzerClassArgTypes(); if (null == luceneAnalyzerClassName || luceneAnalyzerClassName.isEmpty() - || (luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName()) - && luceneAnalyzerClassArgs.isEmpty() && luceneAnalyzerClassArgTypes.isEmpty())) { + || ((luceneAnalyzerClassName.equals(CaseAwareStandardAnalyzer.class.getName()) + || luceneAnalyzerClassName.equals(StandardAnalyzer.class.getName())) + && luceneAnalyzerClassArgs.isEmpty() && luceneAnalyzerClassArgTypes.isEmpty())) { // When there is no analyzer defined, or when StandardAnalyzer (default) is used without arguments, // use existing logic to obtain an instance of StandardAnalyzer with customized stop words return TextIndexUtils.getStandardAnalyzerWithCustomizedStopWords( - config.getStopWordsInclude(), config.getStopWordsExclude(), config.isCaseSensitive()); + config.getStopWordsInclude(), config.getStopWordsExclude(), config.isCaseSensitive()); } // Custom analyzer + custom configs via reflection @@ -177,7 +180,7 @@ public class TextIndexUtils { // Return a new 
instance of custom lucene analyzer class return (Analyzer) luceneAnalyzerClass.getConstructor(argClasses.toArray(new Class<?>[0])) - .newInstance(argValues.toArray(new Object[0])); + .newInstance(argValues.toArray(new Object[0])); } /** @@ -186,7 +189,8 @@ public class TextIndexUtils { * @return Class object of the value type * @throws ClassNotFoundException when the value type is not supported */ - public static Class<?> parseSupportedTypes(String valueTypeString) throws ClassNotFoundException { + public static Class<?> parseSupportedTypes(String valueTypeString) + throws ClassNotFoundException { try { // Support both primitive types + class switch (valueTypeString) { @@ -223,7 +227,7 @@ public class TextIndexUtils { * @throws ReflectiveOperationException if value cannot be coerced without ambiguity or encountered unsupported type */ public static Object parseSupportedTypeValues(String stringValue, Class<?> clazz) - throws ReflectiveOperationException { + throws ReflectiveOperationException { try { if (clazz.equals(String.class)) { return stringValue; @@ -260,7 +264,7 @@ public class TextIndexUtils { } } catch (NumberFormatException | ReflectiveOperationException ex) { String exceptionMessage = "Custom analyzer argument cannot be coerced from " - + stringValue + " to " + clazz.getName() + " type"; + + stringValue + " to " + clazz.getName() + " type"; LOGGER.error(exceptionMessage); throw new ReflectiveOperationException(exceptionMessage); } catch (UnsupportedOperationException ex) { @@ -280,14 +284,12 @@ public class TextIndexUtils { if (stopWordsExclude != null) { stopWordsExclude.forEach(stopWordSet::remove); } - if (isCaseSensitive) { - return new CaseSensitiveAnalyzer(new CharArraySet(stopWordSet, false)); - } - return new StandardAnalyzer(new CharArraySet(stopWordSet, true)); + return new CaseAwareStandardAnalyzer(new CharArraySet(stopWordSet, !isCaseSensitive), isCaseSensitive); } public static Constructor<QueryParserBase> 
getQueryParserWithStringAndAnalyzerTypeConstructor( - String queryParserClassName) throws ReflectiveOperationException { + String queryParserClassName) + throws ReflectiveOperationException { // Fail-fast if the specified query parser class is not a QueryParserBase class final Class<?> queryParserClass = Class.forName(queryParserClassName); if (!QueryParserBase.class.isAssignableFrom(queryParserClass)) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org