mayya-sharipova opened a new pull request, #14430: URL: https://github.com/apache/lucene/pull/14430
This reverts commit ce2a917cf2c2f40b3996656f3b294e3c01d25e5b. ### Description In Elasticsearch (and probably other applications) we reuse the same analyzer across fields, and this change breaks that. A trivial fix in Lucene would be to make the default reuse strategy per-field, but I don't know all the implications of doing so. So I will revert my change for now. For example, the test below currently fails: ```java private static class PhraseWrappedAnalyzer extends AnalyzerWrapper { private final Analyzer delegate; private final int posIncGap; PhraseWrappedAnalyzer(Analyzer delegate, int posIncGap) { super(delegate.getReuseStrategy()); this.delegate = delegate; this.posIncGap = posIncGap; } @Override public int getPositionIncrementGap(String fieldName) { // Delegate or return fixed value? Original test didn't rely on this. // Returning the passed value is consistent with the constructor. // Delegating might be safer generally: return delegate.getPositionIncrementGap(fieldName); return posIncGap; } @Override public int getOffsetGap(String fieldName) { // Delegate offset gap as well for completeness return delegate.getOffsetGap(fieldName); } @Override protected Analyzer getWrappedAnalyzer(String fieldName) { return delegate; } @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { // Wrap the delegate's token stream with FixedShingleFilter for bigrams return new TokenStreamComponents(components.getSource(), new ShingleFilter(components.getTokenStream(), 2)); } } public void testIndexDiffFieldsSameAnalyzer() throws IOException { final Analyzer textAnalyzer = new StandardAnalyzer(); final Analyzer phraseAnalyzer = new PhraseWrappedAnalyzer(textAnalyzer, 0); FieldType textVectorType = new FieldType(TextField.TYPE_NOT_STORED); textVectorType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); textVectorType.freeze(); Map<String, Analyzer> analyzerMap = new HashMap<>(); analyzerMap.put("text", 
textAnalyzer); analyzerMap.put("text_phrases", phraseAnalyzer); // Use this field to store phrase tokens Analyzer perFieldAnalyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), analyzerMap); Directory dir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(perFieldAnalyzer); IndexWriter writer = new IndexWriter(dir, iwc); int maxDocs = 4; String content = "the quick brown fox jumped over the lazy dog"; for (int i = 0; i < maxDocs; i++) { Document doc = new Document(); doc.add(new Field("text", content, textVectorType)); doc.add(new Field("text_phrases", content, textVectorType)); writer.addDocument(doc); } writer.commit(); try (IndexReader reader = DirectoryReader.open(writer)) { assertEquals("Should have indexed maxDocs documents", maxDocs, reader.numDocs()); // Verify term frequencies for the 'text' field Terms textTerms = MultiTerms.getTerms(reader, "text"); assertNotNull("Terms should exist for 'text' field", textTerms); TermsEnum textTermsEnum = textTerms.iterator(); BytesRef term; int termCount = 0; while ((term = textTermsEnum.next()) != null) { assertEquals("Incorrect docFreq for term '" + term.utf8ToString() + "' in field 'text'", maxDocs, textTermsEnum.docFreq()); termCount++; } assertTrue("Should find terms in 'text' field", termCount > 0); // Verify term frequencies for the 'text_phrases' field (shingles) Terms phraseTerms = MultiTerms.getTerms(reader, "text_phrases"); assertNotNull("Terms should exist for 'text_phrases' field", phraseTerms); TermsEnum phraseTermsEnum = phraseTerms.iterator(); BytesRef phrase; int phraseCount = 0; while ((phrase = phraseTermsEnum.next()) != null) { assertEquals("Incorrect docFreq for phrase '" + phrase.utf8ToString() + "' in field 'text_phrases'", maxDocs, phraseTermsEnum.docFreq()); phraseCount++; } assertTrue("Should find phrases (shingles) in 'text_phrases' field", phraseCount > 0); } finally { writer.close(); dir.close(); perFieldAnalyzer.close(); } } ``` -- This is an automated message from the 
Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org