Re: [PR] Add Automata.makeCharSet/makeCharClass to optimize regexp [lucene]

via GitHub Wed, 05 Feb 2025 12:30:13 -0800


mikemccand commented on code in PR #14193:
URL: https://github.com/apache/lucene/pull/14193#discussion_r1943595546



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -648,13 +645,16 @@ private Automaton toAutomaton(
         break;
       case REGEXP_CHAR:
         if (check(ASCII_CASE_INSENSITIVE)) {
-          a = toCaseInsensitiveChar(c);
+          a = Automata.makeCharSet(toCaseInsensitiveChar(c));
         } else {
           a = Automata.makeChar(c);
         }
         break;
       case REGEXP_CHAR_RANGE:
-        a = Automata.makeCharRange(from, to);
+        a = Automata.makeCharRange(from[0], to[0]);

Review Comment:
   Do we check that `from.length == 1` and `to.length == 1` somewhere?



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -828,13 +838,10 @@ void toStringBuilder(StringBuilder b) {
         if (digits > 0) for (int i = s2.length(); i < digits; i++) 
b.append('0');
         b.append(s2).append(">");
         break;
-      case REGEXP_PRE_CLASS:
-        b.append("\\").appendCodePoint(from);
-        break;
     }
   }
 
-  /** Like to string, but more verbose (shows the higherchy more clearly). */
+  /** Like to string, but more verbose (shows the hierarchy more clearly). */

Review Comment:
   Oooh nice typo fix!



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws 
IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    RegExp e = parseCharClass();
-    while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass());
-    return e;
-  }
-
-  final RegExp parseCharClass() throws IllegalArgumentException {
-    RegExp predefinedExp = matchPredefinedCharacterClass();
-    if (predefinedExp != null) {
-      return predefinedExp;
-    }
-
-    int c = parseCharExp();
-    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(flags, c);
-  }
-
-  RegExp expandPredefined() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    switch (from) {
-      case 'd':
-        return new RegExp("[0-9]"); // digit
-      case 'D':
-        return new RegExp("[^0-9]"); // non-digit
-      case 's':
-        return new RegExp("[ \t\n\r]"); // whitespace
-      case 'S':
-        return new RegExp("[^\\s]"); // non-whitespace
-      case 'w':
-        return new RegExp("[a-zA-Z_0-9]"); // word
-      case 'W':
-        return new RegExp("[^\\w]"); // non-word
-      default:
-        throw new IllegalArgumentException("invalid character class " + from);
-    }
-  }
+    var starts = new ArrayList<Integer>();
+    var ends = new ArrayList<Integer>();
 
-  final RegExp matchPredefinedCharacterClass() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    if (match('\\')) {
-      if (peek("dDwWsS")) {
-        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, 
next(), 0);
+    do {
+      // look for escape
+      if (match('\\')) {
+        expandPreDefined(starts, ends);
+      } else {
+        // parse a character
+        int c = parseCharExp();
+
+        if (match('-')) {
+          // range from c-d
+          starts.add(c);
+          ends.add(parseCharExp());
+        } else if (check(ASCII_CASE_INSENSITIVE)) {
+          // single case-insensitive character
+          for (int form : toCaseInsensitiveChar(c)) {
+            starts.add(form);
+            ends.add(form);
+          }
+        } else {
+          // single character
+          starts.add(c);
+          ends.add(c);
+        }
       }
+    } while (more() && !peek("]"));
 
-      if (peek("\\")) {
-        return makeChar(flags, next());
+    // not sure why we bother optimizing nodes, same automaton...
+    // definitely saves time vs fixing toString()-based tests.
+    if (starts.size() == 1) {
+      if (starts.get(0) == ends.get(0)) {
+        return makeChar(flags, starts.get(0));
+      } else {
+        return makeCharRange(flags, starts.get(0), ends.get(0));
       }
+    } else {
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
+    }
+  }
 
+  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+    if (peek("\\")) {
+      // escape
+      starts.add((int) '\\');
+      ends.add((int) '\\');
+      next();
+    } else if (peek("d")) {
+      // digit: [0-9]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      next();
+    } else if (peek("D")) {
+      // non-digit: [^0-9]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("s")) {
+      // whitespace: [\t-\n\r ]
+      starts.add((int) '\t');
+      ends.add((int) '\n');
+      starts.add((int) '\r');
+      ends.add((int) '\r');
+      starts.add((int) ' ');
+      ends.add((int) ' ');
+      next();
+    } else if (peek("S")) {
+      // non-whitespace: [^ \t\n\r]

Review Comment:
   Hmm maybe fix this comment to put the space char at the end for consistency 
(the chars are sorted by their ascii/codepoint, ascending)?



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws 
IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    RegExp e = parseCharClass();
-    while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass());
-    return e;
-  }
-
-  final RegExp parseCharClass() throws IllegalArgumentException {
-    RegExp predefinedExp = matchPredefinedCharacterClass();
-    if (predefinedExp != null) {
-      return predefinedExp;
-    }
-
-    int c = parseCharExp();
-    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(flags, c);
-  }
-
-  RegExp expandPredefined() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    switch (from) {
-      case 'd':
-        return new RegExp("[0-9]"); // digit
-      case 'D':
-        return new RegExp("[^0-9]"); // non-digit
-      case 's':
-        return new RegExp("[ \t\n\r]"); // whitespace
-      case 'S':
-        return new RegExp("[^\\s]"); // non-whitespace
-      case 'w':
-        return new RegExp("[a-zA-Z_0-9]"); // word
-      case 'W':
-        return new RegExp("[^\\w]"); // non-word
-      default:
-        throw new IllegalArgumentException("invalid character class " + from);
-    }
-  }
+    var starts = new ArrayList<Integer>();
+    var ends = new ArrayList<Integer>();
 
-  final RegExp matchPredefinedCharacterClass() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    if (match('\\')) {
-      if (peek("dDwWsS")) {
-        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, 
next(), 0);
+    do {
+      // look for escape
+      if (match('\\')) {
+        expandPreDefined(starts, ends);
+      } else {
+        // parse a character
+        int c = parseCharExp();
+
+        if (match('-')) {
+          // range from c-d
+          starts.add(c);
+          ends.add(parseCharExp());
+        } else if (check(ASCII_CASE_INSENSITIVE)) {
+          // single case-insensitive character
+          for (int form : toCaseInsensitiveChar(c)) {
+            starts.add(form);
+            ends.add(form);
+          }
+        } else {
+          // single character
+          starts.add(c);
+          ends.add(c);
+        }
       }
+    } while (more() && !peek("]"));
 
-      if (peek("\\")) {
-        return makeChar(flags, next());
+    // not sure why we bother optimizing nodes, same automaton...
+    // definitely saves time vs fixing toString()-based tests.
+    if (starts.size() == 1) {
+      if (starts.get(0) == ends.get(0)) {
+        return makeChar(flags, starts.get(0));
+      } else {
+        return makeCharRange(flags, starts.get(0), ends.get(0));
       }
+    } else {
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
+    }
+  }
 
+  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+    if (peek("\\")) {
+      // escape
+      starts.add((int) '\\');
+      ends.add((int) '\\');
+      next();
+    } else if (peek("d")) {
+      // digit: [0-9]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      next();
+    } else if (peek("D")) {
+      // non-digit: [^0-9]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("s")) {
+      // whitespace: [\t-\n\r ]
+      starts.add((int) '\t');
+      ends.add((int) '\n');
+      starts.add((int) '\r');
+      ends.add((int) '\r');
+      starts.add((int) ' ');
+      ends.add((int) ' ');
+      next();
+    } else if (peek("S")) {
+      // non-whitespace: [^ \t\n\r]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('\t' - 1);
+      starts.add('\n' + 1);

Review Comment:
   Maybe add a comment that you do not do the `\t-\n` case because they are 
adjacent codepoints (`0x9` and `0xa`) so there are no characters in the negated 
range in between them?



##########
lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java:
##########
@@ -140,6 +141,32 @@ public static Automaton makeCharRange(int min, int max) {
     return a;
   }
 
+  /** Returns a new minimal automaton that accepts any of the provided 
codepoints */
+  public static Automaton makeCharSet(int[] codepoints) {

Review Comment:
   I guess it's harmless if you have the same codepoint more than once in the 
incoming array?  Automaton will dedup those dups in `a.finishState()`?  Maybe 
we should add a test case making sure that's OK ...



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws 
IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    RegExp e = parseCharClass();
-    while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass());
-    return e;
-  }
-
-  final RegExp parseCharClass() throws IllegalArgumentException {
-    RegExp predefinedExp = matchPredefinedCharacterClass();
-    if (predefinedExp != null) {
-      return predefinedExp;
-    }
-
-    int c = parseCharExp();
-    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(flags, c);
-  }
-
-  RegExp expandPredefined() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    switch (from) {
-      case 'd':
-        return new RegExp("[0-9]"); // digit
-      case 'D':
-        return new RegExp("[^0-9]"); // non-digit
-      case 's':
-        return new RegExp("[ \t\n\r]"); // whitespace
-      case 'S':
-        return new RegExp("[^\\s]"); // non-whitespace
-      case 'w':
-        return new RegExp("[a-zA-Z_0-9]"); // word
-      case 'W':
-        return new RegExp("[^\\w]"); // non-word
-      default:
-        throw new IllegalArgumentException("invalid character class " + from);
-    }
-  }
+    var starts = new ArrayList<Integer>();
+    var ends = new ArrayList<Integer>();
 
-  final RegExp matchPredefinedCharacterClass() {
-    // See 
https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    if (match('\\')) {
-      if (peek("dDwWsS")) {
-        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, 
next(), 0);
+    do {
+      // look for escape
+      if (match('\\')) {
+        expandPreDefined(starts, ends);
+      } else {
+        // parse a character
+        int c = parseCharExp();
+
+        if (match('-')) {
+          // range from c-d
+          starts.add(c);
+          ends.add(parseCharExp());
+        } else if (check(ASCII_CASE_INSENSITIVE)) {
+          // single case-insensitive character
+          for (int form : toCaseInsensitiveChar(c)) {
+            starts.add(form);
+            ends.add(form);
+          }
+        } else {
+          // single character
+          starts.add(c);
+          ends.add(c);
+        }
       }
+    } while (more() && !peek("]"));
 
-      if (peek("\\")) {
-        return makeChar(flags, next());
+    // not sure why we bother optimizing nodes, same automaton...
+    // definitely saves time vs fixing toString()-based tests.
+    if (starts.size() == 1) {
+      if (starts.get(0) == ends.get(0)) {
+        return makeChar(flags, starts.get(0));
+      } else {
+        return makeCharRange(flags, starts.get(0), ends.get(0));
       }
+    } else {
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
+    }
+  }
 
+  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+    if (peek("\\")) {
+      // escape
+      starts.add((int) '\\');
+      ends.add((int) '\\');
+      next();
+    } else if (peek("d")) {
+      // digit: [0-9]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      next();
+    } else if (peek("D")) {
+      // non-digit: [^0-9]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("s")) {
+      // whitespace: [\t-\n\r ]
+      starts.add((int) '\t');
+      ends.add((int) '\n');
+      starts.add((int) '\r');
+      ends.add((int) '\r');
+      starts.add((int) ' ');
+      ends.add((int) ' ');
+      next();
+    } else if (peek("S")) {
+      // non-whitespace: [^ \t\n\r]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('\t' - 1);
+      starts.add('\n' + 1);
+      ends.add('\r' - 1);
+      starts.add('\r' + 1);
+      ends.add(' ' - 1);
+      starts.add(' ' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("w")) {
+      // word: [0-9A-Z_a-z]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      starts.add((int) 'A');
+      ends.add((int) 'Z');
+      starts.add((int) '_');
+      ends.add((int) '_');
+      starts.add((int) 'a');
+      ends.add((int) 'z');
+      next();
+    } else if (peek("W")) {
+      // non-word: [^0-9A-Z_a-z]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add('A' - 1);
+      starts.add('Z' + 1);
+      ends.add('_' - 1);
+      starts.add('_' + 1);
+      ends.add('a' - 1);

Review Comment:
   Heh this range is just one character: ` (back quote).



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -1050,14 +1059,25 @@ static RegExp makeDeprecatedComplement(int flags, 
RegExp exp) {
   }
 
   static RegExp makeChar(int flags, int c) {
-    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, 0, 0);
+    return newLeafNode(flags, Kind.REGEXP_CHAR, null, c, 0, 0, 0, null, null);
   }
 
   static RegExp makeCharRange(int flags, int from, int to) {
     if (from > to)
       throw new IllegalArgumentException(
           "invalid range: from (" + from + ") cannot be > to (" + to + ")");
-    return newLeafNode(flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, from, 
to);
+    return newLeafNode(
+        flags, Kind.REGEXP_CHAR_RANGE, null, 0, 0, 0, 0, new int[] {from}, new 
int[] {to});

Review Comment:
   Oh if this is the only place where we create `REXEGP_CHAR_RANGE` (the API is 
not public) then we don't need to otherwise check `from.length == 1` and 
`to.length == 1`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Add Automata.makeCharSet/makeCharClass to optimize regexp [lucene]

Reply via email to