madrob commented on a change in pull request #1042: LUCENE-9068: Build FuzzyQuery automata up-front URL: https://github.com/apache/lucene-solr/pull/1042#discussion_r362871453
########## File path: lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java ########## @@ -92,76 +105,62 @@ * * @param terms Delivers terms. * @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery} - * thats contains information about competitive boosts during rewrite. It is also used - * to cache DFAs between segment transitions. + * that contains information about competitive boosts during rewrite * @param term Pattern term. * @param maxEdits Maximum edit distance. - * @param prefixLength Length of required common prefix. Default value is 0. + * @param automata An array of levenshtein automata to match against terms, + * see {@link #buildAutomata(String, int[], int, boolean, int)} * @throws IOException if there is a low-level IO error */ - public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, - final int maxEdits, final int prefixLength, boolean transpositions) throws IOException { - if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { - throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits); - } - if (prefixLength < 0) { - throw new IllegalArgumentException("prefixLength cannot be less than 0"); - } + public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength, + final int maxEdits, CompiledAutomaton[] automata) throws IOException { + this.maxEdits = maxEdits; this.terms = terms; this.term = term; - - // convert the string into a utf32 int[] representation for fast comparisons - this.termText = stringToUTF32(term.text()); - this.termLength = termText.length; + this.atts = atts; + this.termLength = termLength; - this.dfaAtt = atts.addAttribute(LevenshteinAutomataAttribute.class); this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); + this.boostAtt = atts.addAttribute(BoostAttribute.class); - // NOTE: boostAtt must pulled from attributes() not from atts! This is because TopTermsRewrite looks for boostAtt from this TermsEnum's - // private attributes() and not the global atts passed to us from MultiTermQuery: - this.boostAtt = attributes().addAttribute(BoostAttribute.class); - - //The prefix could be longer than the word. - //It's kind of silly though. It means we must match the entire word. - this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength; - this.transpositions = transpositions; - - CompiledAutomaton[] prevAutomata = dfaAtt.automata(); - if (prevAutomata == null) { - prevAutomata = new CompiledAutomaton[maxEdits+1]; - Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits); - for (int i = 0; i <= maxEdits; i++) { - try { - prevAutomata[i] = new CompiledAutomaton(automata[i], true, false); - } catch (TooComplexToDeterminizeException e) { - throw new FuzzyTermsException(term.text(), e); - } - } - // first segment computes the automata, and we share with subsequent segments via this Attribute: - dfaAtt.setAutomata(prevAutomata); - } + this.automata = automata; - this.automata = prevAutomata; bottom = maxBoostAtt.getMaxNonCompetitiveBoost(); bottomTerm = maxBoostAtt.getCompetitiveTerm(); bottomChanged(null); } /** - * Builds a binary Automaton to match a fuzzy term - * @param text the term to match - * @param prefixLength length of a required common prefix - * @param transpositions {@code true} if transpositions should count as a single edit - * @param maxEdits the maximum edit distance of matching terms + * Sets the maximum non-competitive boost, which may allow switching to a + * lower max-edit automaton at run time + */ + public void setMaxNonCompetitiveBoost(float boost) { + this.maxBoostAtt.setMaxNonCompetitiveBoost(boost); Review comment: Does this need to call `bottomChanged`? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org