rmuir commented on code in PR #12312: URL: https://github.com/apache/lucene/pull/12312#discussion_r1199139773
########## lucene/core/src/java/org/apache/lucene/util/automaton/DaciukMihovAutomatonBuilder.java: ########## @@ -308,17 +290,83 @@ private void replaceOrRegister(State state) { } } - /** - * Add a suffix of <code>current</code> starting at <code>fromIndex</code> (inclusive) to state - * <code>state</code>. - */ - private void addSuffix(State state, CharSequence current, int fromIndex) { - final int len = current.length(); - while (fromIndex < len) { - int cp = Character.codePointAt(current, fromIndex); - state = state.newState(cp); - fromIndex += Character.charCount(cp); + private static class CharacterBasedBuilder extends DaciukMihovAutomatonBuilder { + private final CharsRefBuilder scratch = new CharsRefBuilder(); + + @Override + protected void add(BytesRef current) { + // Convert the input UTF-8 bytes to CharsRef so we can use the code points as our transition + // labels. Review Comment: Never mind, sorry for the noise. I read the diff wrong and got the char/binary mixed up. I think you are doing it right. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org