rmuir commented on code in PR #14278: URL: https://github.com/apache/lucene/pull/14278#discussion_r1967791155
########## lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java: ########## @@ -682,4 +687,41 @@ protected TokenStreamComponents createComponents(String fieldName) { checkOneTerm(b, "", ""); b.close(); } + + public void testDecompoundingWithConsumingChars() throws Exception { + + CharArraySet dict = makeDictionary("wein", "schwein", "fleisch"); + + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + String searchTerm = "schweinefleisch"; + DictionaryCompoundWordTokenFilter tf = + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + + assertTokenStreamContents(tf, new String[] {searchTerm, "schwein", "fleisch"}); + } + + public void testDecompoundingWithConsumingChars2() throws Exception { + CharArraySet dict = makeDictionary("waffe", "affe", "kampf"); + + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + String searchTerm = "nahkampfwaffen"; + + DictionaryCompoundWordTokenFilter tf = + getDictionaryCompoundWordTokenFilter(tokenizer, searchTerm, dict); + + assertTokenStreamContents(tf, new String[] {searchTerm, "kampf", "waffe"}); + } + + private DictionaryCompoundWordTokenFilter getDictionaryCompoundWordTokenFilter( + Tokenizer tokenizer, String searchTerm, CharArraySet dict) { + tokenizer.setReader(new StringReader(searchTerm)); + return new DictionaryCompoundWordTokenFilter( + tokenizer, + dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, + true, + true); Review Comment: Could we add a case with: ```java longestMatch = false; consumeChars = true; ``` If the combination doesn't make sense, let's just throw an `IllegalArgumentException` in the constructor and have the test `expectThrows()` that? -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org