[ https://issues.apache.org/jira/browse/LUCENE-10358?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Uwe Schindler updated LUCENE-10358: ----------------------------------- Labels: random-chains (was: ) > JapaneseIterationMarkCharFilter: TestRandomChains fails with incorrect > offsets or causes IndexOutOfBounds > --------------------------------------------------------------------------------------------------------- > > Key: LUCENE-10358 > URL: https://issues.apache.org/jira/browse/LUCENE-10358 > Project: Lucene - Core > Issue Type: Bug > Components: modules/analysis > Reporter: Uwe Schindler > Priority: Major > Labels: random-chains > > Failures seen: > {noformat} > $ gradlew :lucene:analysis:integration.tests:test --tests > TestRandomChains.testRandomChainsWithLargeStrings > -Dtests.seed=AA632771CC823702 -Dtests.slow=true -Dtests.locale=fr-MF > -Dtests.timezone=America/Panama -Dtests.asserts=true > -Dtests.file.encoding=UTF-8 > org.apache.lucene.analysis.tests.TestRandomChains > test suite's output saved > to C:\Users\Uwe > Schindler\Projects\lucene\lucene\lucene\analysis\integration.tests\build\test-results\test\outputs\OUTPUT-org.apache.lucene.analysis.tests.TestRandomChains.txt, > copied below: > 2> stage 0: ÉÆû<[0-2] +1> ÉÆä<[4-6] +1> ppkarrpf<[7-14] +1> 1<[16-17] +1> > 5<[18-19] +1> > 2> stage 1: ÉÆû<[0-2] +1> ÉÆä<[4-6] +1> 000000<[4-6] +0> ppkarrpf<[7-14] > +1> 759700<[7-14] +0> 1<[16-17] +1> 5<[18-19] +1> 000000<[18-19] +0> > 2> stage 2: ÉÆû<[0-2] +1> ÉÆä<[4-6] +1> 000000<[4-6] +0> ppkarrpf<[7-14] > +1> 759700<[7-14] +0> 1<[16-17] +1> 000000<[18-19] +0> > 2> TEST FAIL: useCharFilter=true text='\ud801\udc96\ud801\udcaa\ud801\udc84 > ppkarpf {1,5}g?)u em mbm hbil' > 2> Exception from random analyzer: > 2> charfilters= > 2> > org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter(java.io.StringReader@105e6aa7, > true, false) > 2> tokenizer= > 2> org.apache.lucene.analysis.th.ThaiTokenizer() > 2> filters= > 2> > Conditional:org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter(OneTimeWrapper@79889b7f > > 
term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1, > true) > 2> > org.apache.lucene.analysis.ja.JapaneseNumberFilter(ValidatingTokenFilter@53a9e96c > > term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1,keyword=false) > 2> > org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter(ValidatingTokenFilter@6cb4578d > > term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1,keyword=false, > > org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter$StemmerOverrideMap@51fc8124) > > java.lang.IllegalStateException: stage 2: inconsistent startOffset > at pos=3: 16 vs 18; token=000000 > > at > __randomizedtesting.SeedInfo.seed([AA632771CC823702:C038986095CC17F1]:0) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.ValidatingTokenFilter.incrementToken(ValidatingTokenFilter.java:138) > > at > org.apache.lucene.analysis.common@10.0.0-SNAPSHOT/org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.incrementToken(StemmerOverrideFilter.java:67) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.ValidatingTokenFilter.incrementToken(ValidatingTokenFilter.java:81) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkAnalysisConsistency(BaseTokenStreamTestCase.java:1130) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData(BaseTokenStreamTestCase.java:1028) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData(BaseTokenStreamTestCase.java:922) > > at > org.apache.lucene.analysis.tests@10.0.0-SNAPSHOT/org.apache.lucene.analysis.tests.TestRandomChains.testRandomChainsWithLargeStrings(TestRandomChains.java:943) > 
{noformat} > and also: > {noformat} > $ gradlew :lucene:analysis:integration.tests:test --tests > TestRandomChains.testRandomChains -Dtests.seed=3A0D0E91E0CA5BFC > -Dtests.slow=true -Dtests.locale=nmg-CM -Dtests.timezone=Antarctica/Vostok > -Dtests.asserts=true -Dtests.file.encoding=UTF-8 > org.apache.lucene.analysis.tests.TestRandomChains > test suite's output saved > to C:\Users\Uwe > Schindler\Projects\lucene\lucene\lucene\analysis\integration.tests\build\test-results\test_17\outputs\OUTPUT-org.apache.lucene.analysis.tests.TestRandomChains.txt, > copied below: > 2> TEST FAIL: useCharFilter=false text='' > 2> Exception from random analyzer: > 2> charfilters= > 2> > org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter(java.io.StringReader@7ee7c045) > 2> > org.apache.lucene.analysis.charfilter.HTMLStripCharFilter(org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter@66bba53d, > []) > 2> tokenizer= > 2> org.apache.lucene.analysis.core.KeywordTokenizer(27) > 2> filters= > 2> > org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter(ValidatingTokenFilter@1a8623a7 > > term=,bytes=[],startOffset=0,endOffset=0,positionIncrement=1,positionLength=1,type=word,termFrequency=1,keyword=false, > -21) > > java.lang.ArrayIndexOutOfBoundsException: Index -1 out of bounds for > length 32 > > at > __randomizedtesting.SeedInfo.seed([3A0D0E91E0CA5BFC:7EC27F0A7D8463C]:0) > > at > org.apache.lucene.analysis.kuromoji@10.0.0-SNAPSHOT/org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter.stem(JapaneseKatakanaStemFilter.java:76) > > at > org.apache.lucene.analysis.kuromoji@10.0.0-SNAPSHOT/org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter.incrementToken(JapaneseKatakanaStemFilter.java:59) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.ValidatingTokenFilter.incrementToken(ValidatingTokenFilter.java:81) > > at > 
org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkAnalysisConsistency(BaseTokenStreamTestCase.java:1130) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData(BaseTokenStreamTestCase.java:1028) > > at > org.apache.lucene.test_framework@10.0.0-SNAPSHOT/org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.checkRandomData(BaseTokenStreamTestCase.java:922) > > at > org.apache.lucene.analysis.tests@10.0.0-SNAPSHOT/org.apache.lucene.analysis.tests.TestRandomChains.testRandomChains(TestRandomChains.java:911) > {noformat} -- This message was sent by Atlassian Jira (v8.20.1#820001) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@lucene.apache.org For additional commands, e-mail: issues-help@lucene.apache.org