madrob commented on a change in pull request #1467: URL: https://github.com/apache/lucene-solr/pull/1467#discussion_r418121317
########## File path: lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java ########## @@ -88,43 +89,44 @@ * @throws IOException if there is a low-level IO error */ public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException { - this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions); - } - - private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException { - this(terms, new AttributeSource(), term, codePoints.length, maxEdits, - buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits)); + this(terms, new AttributeSource(), term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions)); } /** * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of * length <code>prefixLength</code> with <code>term</code> and which have at most {@code maxEdits} edits. * <p> - * After calling the constructor the enumeration is already pointing to the first - * valid term if such a term exists. - * + * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * * @param terms Delivers terms. - * @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery} - * that contains information about competitive boosts during rewrite + * @param atts An AttributeSource used to share automata between segments * @param term Pattern term. * @param maxEdits Maximum edit distance. - * @param automata An array of levenshtein automata to match against terms, - * see {@link #buildAutomata(String, int[], int, boolean, int)} + * @param prefixLength the length of the required common prefix + * @param transpositions whether transpositions should count as a single edit * @throws IOException if there is a low-level IO error */ - public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength, - final int maxEdits, CompiledAutomaton[] automata) throws IOException { + FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException { + this(terms, atts, term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions)); + } + + private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException { - this.maxEdits = maxEdits; this.terms = terms; - this.term = term; this.atts = atts; - this.termLength = termLength; + this.term = term; this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); this.boostAtt = atts.addAttribute(BoostAttribute.class); - this.automata = automata; + atts.addAttributeImpl(new AutomatonAttributeImpl()); Review comment: The java doc on this suggest that we need to be using an AttributeFactory, but that might be overkill here? ########## File path: lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java ########## @@ -88,43 +89,44 @@ * @throws IOException if there is a low-level IO error */ public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException { - this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions); - } - - private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException { - this(terms, new AttributeSource(), term, codePoints.length, maxEdits, - buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits)); + this(terms, new AttributeSource(), term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions)); } /** * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of * length <code>prefixLength</code> with <code>term</code> and which have at most {@code maxEdits} edits. * <p> - * After calling the constructor the enumeration is already pointing to the first - * valid term if such a term exists. - * + * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * * @param terms Delivers terms. - * @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery} - * that contains information about competitive boosts during rewrite + * @param atts An AttributeSource used to share automata between segments * @param term Pattern term. * @param maxEdits Maximum edit distance. - * @param automata An array of levenshtein automata to match against terms, - * see {@link #buildAutomata(String, int[], int, boolean, int)} + * @param prefixLength the length of the required common prefix + * @param transpositions whether transpositions should count as a single edit * @throws IOException if there is a low-level IO error */ - public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength, - final int maxEdits, CompiledAutomaton[] automata) throws IOException { + FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException { + this(terms, atts, term, () -> new FuzzyAutomatonBuilder(term.text(), maxEdits, prefixLength, transpositions)); + } + + private FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, Supplier<FuzzyAutomatonBuilder> automatonBuilder) throws IOException { - this.maxEdits = maxEdits; this.terms = terms; - this.term = term; this.atts = atts; - this.termLength = termLength; + this.term = term; this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); this.boostAtt = atts.addAttribute(BoostAttribute.class); - this.automata = automata; + atts.addAttributeImpl(new AutomatonAttributeImpl()); + AutomatonAttribute aa = atts.addAttribute(AutomatonAttribute.class); Review comment: why not getAttribute? If we're sharing this across segments, then I think we want to reuse the same one each time, right? ########## File path: lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java ########## @@ -193,7 +172,7 @@ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOExc if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(), term.bytes()); } - return new FuzzyTermsEnum(terms, atts, getTerm(), termLength, maxEdits, automata); + return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); Review comment: do we need to update the java docs on multi term query to reflect that fuzzy query also uses the attribute source now as well? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org