bruno-roustant commented on code in PR #13431: URL: https://github.com/apache/lucene/pull/13431#discussion_r1617116464
########## lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java: ########## @@ -168,33 +171,53 @@ public UserMorphData getMorphAttributes() { * @return array of {wordId, position, length} */ public int[][] lookup(char[] chars, int off, int len) throws IOException { - // TODO: can we avoid this treemap/toIndexArray? - TreeMap<Integer, int[]> result = new TreeMap<>(); // index, [length, length...] - boolean found = false; // true if we found any results - + List<Match> matches = null; + int numResults = 0; final FST.BytesReader fstReader = fst.getBytesReader(); - + final int end = off + len; FST.Arc<Long> arc = new FST.Arc<>(); - int end = off + len; for (int startOffset = off; startOffset < end; startOffset++) { + int[] wordIdAndLength = null; arc = fst.getFirstArc(arc); int output = 0; - int remaining = end - startOffset; - for (int i = 0; i < remaining; i++) { + for (int i = 0, remaining = end - startOffset; i < remaining; i++) { int ch = chars[startOffset + i]; if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) { break; // continue to next position } output += arc.output().intValue(); if (arc.isFinal()) { - final int finalOutput = output + arc.nextFinalOutput().intValue(); - result.put(startOffset - off, segmentations[finalOutput]); - found = true; + int finalOutput = output + arc.nextFinalOutput().intValue(); + wordIdAndLength = segmentations[finalOutput]; } } + if (wordIdAndLength != null) { + if (matches == null) { + matches = new ArrayList<>(); + } + matches.add(new Match(startOffset - off, wordIdAndLength)); + numResults += wordIdAndLength.length - 1; + } } - - return found ? toIndexArray(result) : EMPTY_RESULT; + if (numResults == 0) { + return EMPTY_RESULT; + } + int[][] result = new int[numResults][]; Review Comment: Interestingly, I also tried to build the result array directly inside the first loop on startOffset. 
While that reduced the amount of code, it slowed down execution, probably because moving this second loop inside the first one disturbed the optimization of the first loop. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org