msfroh commented on code in PR #14349: URL: https://github.com/apache/lucene/pull/14349#discussion_r1992424195
########## lucene/core/src/java/org/apache/lucene/search/CaseInsensitiveTermInSetQuery.java: ########## @@ -81,58 +89,95 @@ public void visit(QueryVisitor visitor) { visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton); } } - + + /** + * Creates an efficient ByteRunAutomaton implementation for case-insensitive term matching. + * + * <p>This implementation addresses the maintainer's concern about automaton inefficiency with + * large numbers of terms. Instead of building a complex automaton that requires expensive + * determinization, we use a HashSet-based approach that provides O(1) lookups regardless of the + * number of terms. + * + * @return A custom ByteRunAutomaton that efficiently matches terms case-insensitively + */ private ByteRunAutomaton asByteRunAutomaton() { - // Create regex for case-insensitive matching: term1|term2|term3... - StringBuilder pattern = new StringBuilder(); - boolean first = true; - + // Convert all terms to lowercase for case-insensitive comparison + final Set<String> lowerCaseTerms = new HashSet<>(terms.size()); for (BytesRef term : terms) { - if (!first) { - pattern.append('|'); - } - first = false; - pattern.append(term.utf8ToString()); - } - - // Create regex automaton with case-insensitive flag - RegExp regexp = new RegExp(pattern.toString(), RegExp.NONE, RegExp.CASE_INSENSITIVE); - Automaton automaton = regexp.toAutomaton(); - - // Ensure the automaton is deterministic - if (!automaton.isDeterministic()) { - automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + lowerCaseTerms.add(term.utf8ToString().toLowerCase()); } - - return new ByteRunAutomaton(automaton); + + // Create a minimal automaton shell + final Automaton emptyAutomaton = new Automaton(); + emptyAutomaton.createState(); + + // Override the run method with our HashSet-based implementation + return new ByteRunAutomaton(emptyAutomaton) { + @Override + public boolean run(byte[] s, int offset, int length) { + // Convert input bytes to string and lowercase for comparison + BytesRef slice = new BytesRef(s, offset, length); + String termString = slice.utf8ToString().toLowerCase(); + return lowerCaseTerms.contains(termString); + } Review Comment: The `ByteRunAutomaton` is only used for highlighting (see `MultiTermHighlighting`). The actual query will use the `Automaton` passed to `AutomatonQuery`. ########## lucene/core/src/java/org/apache/lucene/search/CaseInsensitiveTermInSetQuery.java: ########## @@ -81,58 +89,95 @@ public void visit(QueryVisitor visitor) { visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton); } } - + + /** + * Creates an efficient ByteRunAutomaton implementation for case-insensitive term matching. + * + * <p>This implementation addresses the maintainer's concern about automaton inefficiency with + * large numbers of terms. Instead of building a complex automaton that requires expensive + * determinization, we use a HashSet-based approach that provides O(1) lookups regardless of the + * number of terms. + * + * @return A custom ByteRunAutomaton that efficiently matches terms case-insensitively + */ private ByteRunAutomaton asByteRunAutomaton() { - // Create regex for case-insensitive matching: term1|term2|term3... - StringBuilder pattern = new StringBuilder(); - boolean first = true; - + // Convert all terms to lowercase for case-insensitive comparison + final Set<String> lowerCaseTerms = new HashSet<>(terms.size()); for (BytesRef term : terms) { - if (!first) { - pattern.append('|'); - } - first = false; - pattern.append(term.utf8ToString()); - } - - // Create regex automaton with case-insensitive flag - RegExp regexp = new RegExp(pattern.toString(), RegExp.NONE, RegExp.CASE_INSENSITIVE); - Automaton automaton = regexp.toAutomaton(); - - // Ensure the automaton is deterministic - if (!automaton.isDeterministic()) { - automaton = Operations.determinize(automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT); + lowerCaseTerms.add(term.utf8ToString().toLowerCase()); } - - return new ByteRunAutomaton(automaton); + + // Create a minimal automaton shell + final Automaton emptyAutomaton = new Automaton(); + emptyAutomaton.createState(); + + // Override the run method with our HashSet-based implementation + return new ByteRunAutomaton(emptyAutomaton) { + @Override + public boolean run(byte[] s, int offset, int length) { + // Convert input bytes to string and lowercase for comparison + BytesRef slice = new BytesRef(s, offset, length); + String termString = slice.utf8ToString().toLowerCase(); + return lowerCaseTerms.contains(termString); + } + }; } @Override public Query rewrite(IndexSearcher indexSearcher) throws IOException { if (terms.isEmpty()) { return new MatchNoDocsQuery("Empty terms set"); } - - // For a single term, use a simple RegexpQuery + + // For a single term, use a RegexpQuery with case-insensitive flag if (terms.size() == 1) { return new RegexpQuery( - new Term(field, terms.iterator().next().utf8ToString()), - RegExp.NONE, - RegExp.CASE_INSENSITIVE); - } - - // For multiple terms, create a boolean query of regexps - BooleanQuery.Builder builder = new BooleanQuery.Builder(); - for (BytesRef term : terms) { - RegexpQuery regexpQuery = new RegexpQuery( - new Term(field, term.utf8ToString()), + new Term(field, terms.iterator().next().utf8ToString()), RegExp.NONE, RegExp.CASE_INSENSITIVE); - - builder.add(regexpQuery, BooleanClause.Occur.SHOULD); } - - return builder.build(); + + // Handle different term set sizes appropriately: + // 1. For smaller sets (<=100 terms), create a BooleanQuery with RegexpQueries + // 2. For large sets (>100 terms), use an AutomatonQuery approach to avoid BooleanQuery clause + // limits + + if (terms.size() <= 100) { + // For a reasonable number of terms, create a BooleanQuery of regexps + BooleanQuery.Builder builder = new BooleanQuery.Builder(); + for (BytesRef term : terms) { + RegexpQuery regexpQuery = + new RegexpQuery( + new Term(field, term.utf8ToString()), RegExp.NONE, RegExp.CASE_INSENSITIVE); + + builder.add(regexpQuery, BooleanClause.Occur.SHOULD); + } + return builder.build(); + } else { + // For large term sets, use a custom AutomatonQuery with our efficient + // asByteRunAutomaton implementation to avoid BooleanQuery clause limits + AutomatonQuery query = new AutomatonQuery(new Term(field), createAutomaton()); + + // Wrap in a ConstantScoreQuery to maintain consistent scoring + return new ConstantScoreQuery(query); + } + } + + /** + * Creates a minimal automaton for use with AutomatonQuery. + * + * <p>This simple automaton acts as a placeholder. The actual matching logic is handled by our + * custom asByteRunAutomaton() implementation, which efficiently checks terms using a HashSet for + * case-insensitive matching. This approach avoids the inefficiency of building and determinizing + * complex automata with many terms. + */ + private Automaton createAutomaton() { + // Create a minimal automaton that accepts any string + // The actual matching is done via the ByteRunAutomaton override + Automaton automaton = new Automaton(); + int state = automaton.createState(); + automaton.setAccept(state, true); + return automaton; } Review Comment: This `Automaton` will only match the empty string. (It's equivalent to `Automata.makeEmptyString()`.) I believe the query will be very fast, but won't match any terms (except the empty string). In the end, I believe we'd need an `Automaton` that matches all the strings in a case-insensitive fashion. I think it would be possible to do that with some changes to the `StringsToAutomaton` class to support case-insensitive transitions. Essentially, for each codepoint, we would need to add a transition for the lower- and upper-case versions (if applicable) in the `convert` method. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org