mikemccand commented on code in PR #14193: URL: https://github.com/apache/lucene/pull/14193#discussion_r1943595546
########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -648,13 +645,16 @@ private Automaton toAutomaton( break; case REGEXP_CHAR: if (check(ASCII_CASE_INSENSITIVE)) { - a = toCaseInsensitiveChar(c); + a = Automata.makeCharSet(toCaseInsensitiveChar(c)); } else { a = Automata.makeChar(c); } break; case REGEXP_CHAR_RANGE: - a = Automata.makeCharRange(from, to); + a = Automata.makeCharRange(from[0], to[0]); Review Comment: Do we check that `from.length == 1` and `to.length == 1` somewhere? ########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -828,13 +838,10 @@ void toStringBuilder(StringBuilder b) { if (digits > 0) for (int i = s2.length(); i < digits; i++) b.append('0'); b.append(s2).append(">"); break; - case REGEXP_PRE_CLASS: - b.append("\\").appendCodePoint(from); - break; } } - /** Like to string, but more verbose (shows the higherchy more clearly). */ + /** Like to string, but more verbose (shows the hierarchy more clearly). */ Review Comment: Oooh nice typo fix! ########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException { } final RegExp parseCharClasses() throws IllegalArgumentException { - RegExp e = parseCharClass(); - while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass()); - return e; - } - - final RegExp parseCharClass() throws IllegalArgumentException { - RegExp predefinedExp = matchPredefinedCharacterClass(); - if (predefinedExp != null) { - return predefinedExp; - } - - int c = parseCharExp(); - if (match('-')) return makeCharRange(flags, c, parseCharExp()); - else return makeChar(flags, c); - } - - RegExp expandPredefined() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - switch (from) { - case 'd': - return new RegExp("[0-9]"); // digit - case 'D': - return new RegExp("[^0-9]"); // non-digit - case 's': - return new RegExp("[ \t\n\r]"); // whitespace - case 'S': - return new RegExp("[^\\s]"); // non-whitespace - case 'w': - return new RegExp("[a-zA-Z_0-9]"); // word - case 'W': - return new RegExp("[^\\w]"); // non-word - default: - throw new IllegalArgumentException("invalid character class " + from); - } - } + var starts = new ArrayList<Integer>(); + var ends = new ArrayList<Integer>(); - final RegExp matchPredefinedCharacterClass() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - if (match('\\')) { - if (peek("dDwWsS")) { - return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + do { + // look for escape + if (match('\\')) { + expandPreDefined(starts, ends); + } else { + // parse a character + int c = parseCharExp(); + + if (match('-')) { + // range from c-d + starts.add(c); + ends.add(parseCharExp()); + } else if (check(ASCII_CASE_INSENSITIVE)) { + // single case-insensitive character + for (int form : toCaseInsensitiveChar(c)) { + starts.add(form); + ends.add(form); + } + } else { + // single character + starts.add(c); + ends.add(c); + } } + } while (more() && !peek("]")); - if (peek("\\")) { - return makeChar(flags, next()); + // not sure why we bother optimizing nodes, same automaton... + // definitely saves time vs fixing toString()-based tests. + if (starts.size() == 1) { + if (starts.get(0) == ends.get(0)) { + return makeChar(flags, starts.get(0)); + } else { + return makeCharRange(flags, starts.get(0), ends.get(0)); } + } else { + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); + } + } + void expandPreDefined(List<Integer> starts, List<Integer> ends) { + if (peek("\\")) { + // escape + starts.add((int) '\\'); + ends.add((int) '\\'); + next(); + } else if (peek("d")) { + // digit: [0-9] + starts.add((int) '0'); + ends.add((int) '9'); + next(); + } else if (peek("D")) { + // non-digit: [^0-9] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("s")) { + // whitespace: [\t-\n\r ] + starts.add((int) '\t'); + ends.add((int) '\n'); + starts.add((int) '\r'); + ends.add((int) '\r'); + starts.add((int) ' '); + ends.add((int) ' '); + next(); + } else if (peek("S")) { + // non-whitespace: [^ \t\n\r] Review Comment: Hmm maybe fix this comment to put the space char at the end for consistency (the chars are sorted by their ascii/codepoint, ascending)? ########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException { } final RegExp parseCharClasses() throws IllegalArgumentException { - RegExp e = parseCharClass(); - while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass()); - return e; - } - - final RegExp parseCharClass() throws IllegalArgumentException { - RegExp predefinedExp = matchPredefinedCharacterClass(); - if (predefinedExp != null) { - return predefinedExp; - } - - int c = parseCharExp(); - if (match('-')) return makeCharRange(flags, c, parseCharExp()); - else return makeChar(flags, c); - } - - RegExp expandPredefined() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - switch (from) { - case 'd': - return new RegExp("[0-9]"); // digit - case 'D': - return new RegExp("[^0-9]"); // non-digit - case 's': - return new RegExp("[ \t\n\r]"); // whitespace - case 'S': - return new RegExp("[^\\s]"); // non-whitespace - case 'w': - return new RegExp("[a-zA-Z_0-9]"); // word - case 'W': - return new RegExp("[^\\w]"); // non-word - default: - throw new IllegalArgumentException("invalid character class " + from); - } - } + var starts = new ArrayList<Integer>(); + var ends = new ArrayList<Integer>(); - final RegExp matchPredefinedCharacterClass() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - if (match('\\')) { - if (peek("dDwWsS")) { - return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + do { + // look for escape + if (match('\\')) { + expandPreDefined(starts, ends); + } else { + // parse a character + int c = parseCharExp(); + + if (match('-')) { + // range from c-d + starts.add(c); + ends.add(parseCharExp()); + } else if (check(ASCII_CASE_INSENSITIVE)) { + // single case-insensitive character + for (int form : toCaseInsensitiveChar(c)) { + starts.add(form); + ends.add(form); + } + } else { + // single character + starts.add(c); + ends.add(c); + } } + } while (more() && !peek("]")); - if (peek("\\")) { - return makeChar(flags, next()); + // not sure why we bother optimizing nodes, same automaton... + // definitely saves time vs fixing toString()-based tests. + if (starts.size() == 1) { + if (starts.get(0) == ends.get(0)) { + return makeChar(flags, starts.get(0)); + } else { + return makeCharRange(flags, starts.get(0), ends.get(0)); } + } else { + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); + } + } + void expandPreDefined(List<Integer> starts, List<Integer> ends) { + if (peek("\\")) { + // escape + starts.add((int) '\\'); + ends.add((int) '\\'); + next(); + } else if (peek("d")) { + // digit: [0-9] + starts.add((int) '0'); + ends.add((int) '9'); + next(); + } else if (peek("D")) { + // non-digit: [^0-9] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("s")) { + // whitespace: [\t-\n\r ] + starts.add((int) '\t'); + ends.add((int) '\n'); + starts.add((int) '\r'); + ends.add((int) '\r'); + starts.add((int) ' '); + ends.add((int) ' '); + next(); + } else if (peek("S")) { + // non-whitespace: [^ \t\n\r] + starts.add(Character.MIN_CODE_POINT); + ends.add('\t' - 1); + starts.add('\n' + 1); Review Comment: Maybe add a comment that you do not do the `\t-\n` case because they are adjacent codepoints (`0x9` and `0xa`) so there are no characters in the negated range in between them? ########## lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java: ########## @@ -140,6 +141,32 @@ public static Automaton makeCharRange(int min, int max) { return a; } + /** Returns a new minimal automaton that accepts any of the provided codepoints */ + public static Automaton makeCharSet(int[] codepoints) { Review Comment: I guess it's harmless if you have the same codepoint more than once in the incoming array? Automaton will dedup those dups in `a.finishState()`? Maybe we should add a test case making sure that's OK ... ########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException { } final RegExp parseCharClasses() throws IllegalArgumentException { - RegExp e = parseCharClass(); - while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass()); - return e; - } - - final RegExp parseCharClass() throws IllegalArgumentException { - RegExp predefinedExp = matchPredefinedCharacterClass(); - if (predefinedExp != null) { - return predefinedExp; - } - - int c = parseCharExp(); - if (match('-')) return makeCharRange(flags, c, parseCharExp()); - else return makeChar(flags, c); - } - - RegExp expandPredefined() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - switch (from) { - case 'd': - return new RegExp("[0-9]"); // digit - case 'D': - return new RegExp("[^0-9]"); // non-digit - case 's': - return new RegExp("[ \t\n\r]"); // whitespace - case 'S': - return new RegExp("[^\\s]"); // non-whitespace - case 'w': - return new RegExp("[a-zA-Z_0-9]"); // word - case 'W': - return new RegExp("[^\\w]"); // non-word - default: - throw new IllegalArgumentException("invalid character class " + from); - } - } + var starts = new ArrayList<Integer>(); + var ends = new ArrayList<Integer>(); - final RegExp matchPredefinedCharacterClass() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - if (match('\\')) { - if (peek("dDwWsS")) { - return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + do { + // look for escape + if (match('\\')) { + expandPreDefined(starts, ends); + } else { + // parse a character + int c = parseCharExp(); + + if (match('-')) { + // range from c-d + starts.add(c); + ends.add(parseCharExp()); + } else if (check(ASCII_CASE_INSENSITIVE)) { + // single case-insensitive character + for (int form : toCaseInsensitiveChar(c)) { + starts.add(form); + ends.add(form); + } + } else { + // single character + starts.add(c); + ends.add(c); + } } + } while (more() && !peek("]")); - if (peek("\\")) { - return makeChar(flags, next()); + // not sure why we bother optimizing nodes, same automaton... + // definitely saves time vs fixing toString()-based tests. + if (starts.size() == 1) { + if (starts.get(0) == ends.get(0)) { + return makeChar(flags, starts.get(0)); + } else { + return makeCharRange(flags, starts.get(0), ends.get(0)); } + } else { + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); + } + } + void expandPreDefined(List<Integer> starts, List<Integer> ends) { + if (peek("\\")) { + // escape + starts.add((int) '\\'); + ends.add((int) '\\'); + next(); + } else if (peek("d")) { + // digit: [0-9] + starts.add((int) '0'); + ends.add((int) '9'); + next(); + } else if (peek("D")) { + // non-digit: [^0-9] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("s")) { + // whitespace: [\t-\n\r ] + starts.add((int) '\t'); + ends.add((int) '\n'); + starts.add((int) '\r'); + ends.add((int) '\r'); + starts.add((int) ' '); + ends.add((int) ' '); + next(); + } else if (peek("S")) { + // non-whitespace: [^ \t\n\r] + starts.add(Character.MIN_CODE_POINT); + ends.add('\t' - 1); + starts.add('\n' + 1); + ends.add('\r' - 1); + starts.add('\r' + 1); + ends.add(' ' - 1); + starts.add(' ' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("w")) { + // word: [0-9A-Z_a-z] + starts.add((int) '0'); + ends.add((int) '9'); + starts.add((int) 'A'); + ends.add((int) 'Z'); + starts.add((int) '_'); + ends.add((int) '_'); + starts.add((int) 'a'); + ends.add((int) 'z'); + next(); + } else if (peek("W")) { + // non-word: [^0-9A-Z_a-z] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add('A' - 1); + starts.add('Z' + 1); + ends.add('_' - 1); + starts.add('_' + 1); + ends.add('a' - 1); Review Comment: Heh this range is just one character: ` (back quote). ########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -1050,14 +1059,25 @@ static RegExp makeDeprecatedComplement(int flags, RegExp exp) { } static RegExp makeChar(int flags, int c) { - return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0); + return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, null, null); } static RegExp makeCharRange(int flags, int from, int to) { if (from > to) throw new IllegalArgumentException( "invalid range: from (" + from + ") cannot be > to (" + to + ")"); - return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, to); + return newLeafNode( + flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, new int[] {from}, new int[] {to}); Review Comment: Oh if this is the only place where we create `REXEGP_CHAR_RANGE` (the API is not public) then we don't need to otherwise check `from.length == 1` and `to.length == 1`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org