donnerpeter commented on a change in pull request #2330: URL: https://github.com/apache/lucene-solr/pull/2330#discussion_r572760859
########## File path: lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java ########## @@ -33,44 +40,59 @@ */ class GeneratingSuggester { private static final int MAX_ROOTS = 100; - private static final int MAX_GUESSES = 100; + private static final int MAX_WORDS = 100; + private static final int MAX_GUESSES = 200; private final Dictionary dictionary; + private final SpellChecker speller; - GeneratingSuggester(Dictionary dictionary) { - this.dictionary = dictionary; + GeneratingSuggester(SpellChecker speller) { + this.dictionary = speller.dictionary; + this.speller = speller; } List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) { - List<WeightedWord> roots = findSimilarDictionaryEntries(word, originalCase); - List<WeightedWord> expanded = expandRoots(word, roots); - TreeSet<WeightedWord> bySimilarity = rankBySimilarity(word, expanded); + List<Weighted<DictEntry>> roots = findSimilarDictionaryEntries(word, originalCase); + List<Weighted<String>> expanded = expandRoots(word, roots); + TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded); return getMostRelevantSuggestions(bySimilarity, prevSuggestions); } - private List<WeightedWord> findSimilarDictionaryEntries(String word, WordCase originalCase) { - try { - IntsRefFSTEnum<IntsRef> fstEnum = new IntsRefFSTEnum<>(dictionary.words); - TreeSet<WeightedWord> roots = new TreeSet<>(); + private List<Weighted<DictEntry>> findSimilarDictionaryEntries( + String word, WordCase originalCase) { + TreeSet<Weighted<DictEntry>> roots = new TreeSet<>(); + processFST( + dictionary.words, + (key, forms) -> { + if (Math.abs(key.length - word.length()) > 4) return; + + String root = toString(key); + List<DictEntry> entries = filterSuitableEntries(root, forms); + if (entries.isEmpty()) return; + + if (originalCase == WordCase.LOWER + && WordCase.caseOf(root) == WordCase.TITLE + && !dictionary.hasLanguage("de")) { + return; + } - 
- IntsRefFSTEnum.InputOutput<IntsRef> mapping; - while ((mapping = fstEnum.next()) != null) { - IntsRef key = mapping.input; - if (Math.abs(key.length - word.length()) > 4 || !isSuitableRoot(mapping.output)) continue; - - String root = toString(key); - if (originalCase == WordCase.LOWER - && WordCase.caseOf(root) == WordCase.TITLE - && !dictionary.hasLanguage("de")) { - continue; - } + String lower = dictionary.toLowerCase(root); + int sc = + ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + + commonPrefix(word, root); - String lower = dictionary.toLowerCase(root); - int sc = - ngram(3, word, lower, EnumSet.of(NGramOptions.LONGER_WORSE)) + commonPrefix(word, root); + entries.forEach(e -> roots.add(new Weighted<>(e, sc))); + }); + return roots.stream().limit(MAX_ROOTS).collect(Collectors.toList()); + } - roots.add(new WeightedWord(root, sc)); + private void processFST(FST<IntsRef> fst, BiConsumer<IntsRef, IntsRef> keyValueConsumer) { Review comment: I wonder if it makes sense to add something breakable in the middle, e.g. accepting some processor (unfortunately neither BiFunction nor BiPredicate conveys those semantics for me :( ). OTOH I don't need it right now, and breakability can be added later. Or, it could be made a `Stream` or `Iterable`. One complication though: here I wrap all `IOException`s, but that's probably not a good idea in a general FST case. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org