Re: [PR] Add Automata.makeCharSet/makeCharClass to optimize regexp [lucene]

via GitHub Wed, 05 Feb 2025 12:59:59 -0800


rmuir commented on code in PR #14193:
URL: https://github.com/apache/lucene/pull/14193#discussion_r1943654993



##########
lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java:
##########
@@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException {
   }
 
   final RegExp parseCharClasses() throws IllegalArgumentException {
-    RegExp e = parseCharClass();
-    while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass());
-    return e;
-  }
-
-  final RegExp parseCharClass() throws IllegalArgumentException {
-    RegExp predefinedExp = matchPredefinedCharacterClass();
-    if (predefinedExp != null) {
-      return predefinedExp;
-    }
-
-    int c = parseCharExp();
-    if (match('-')) return makeCharRange(flags, c, parseCharExp());
-    else return makeChar(flags, c);
-  }
-
-  RegExp expandPredefined() {
-    // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    switch (from) {
-      case 'd':
-        return new RegExp("[0-9]"); // digit
-      case 'D':
-        return new RegExp("[^0-9]"); // non-digit
-      case 's':
-        return new RegExp("[ \t\n\r]"); // whitespace
-      case 'S':
-        return new RegExp("[^\\s]"); // non-whitespace
-      case 'w':
-        return new RegExp("[a-zA-Z_0-9]"); // word
-      case 'W':
-        return new RegExp("[^\\w]"); // non-word
-      default:
-        throw new IllegalArgumentException("invalid character class " + from);
-    }
-  }
+    var starts = new ArrayList<Integer>();
+    var ends = new ArrayList<Integer>();
 
-  final RegExp matchPredefinedCharacterClass() {
-    // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html
-    if (match('\\')) {
-      if (peek("dDwWsS")) {
-        return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0);
+    do {
+      // look for escape
+      if (match('\\')) {
+        expandPreDefined(starts, ends);
+      } else {
+        // parse a character
+        int c = parseCharExp();
+
+        if (match('-')) {
+          // range from c-d
+          starts.add(c);
+          ends.add(parseCharExp());
+        } else if (check(ASCII_CASE_INSENSITIVE)) {
+          // single case-insensitive character
+          for (int form : toCaseInsensitiveChar(c)) {
+            starts.add(form);
+            ends.add(form);
+          }
+        } else {
+          // single character
+          starts.add(c);
+          ends.add(c);
+        }
       }
+    } while (more() && !peek("]"));
 
-      if (peek("\\")) {
-        return makeChar(flags, next());
+    // not sure why we bother optimizing nodes, same automaton...
+    // definitely saves time vs fixing toString()-based tests.
+    if (starts.size() == 1) {
+      if (starts.get(0) == ends.get(0)) {
+        return makeChar(flags, starts.get(0));
+      } else {
+        return makeCharRange(flags, starts.get(0), ends.get(0));
       }
+    } else {
+      return makeCharClass(
+          flags,
+          starts.stream().mapToInt(Integer::intValue).toArray(),
+          ends.stream().mapToInt(Integer::intValue).toArray());
+    }
+  }
 
+  void expandPreDefined(List<Integer> starts, List<Integer> ends) {
+    if (peek("\\")) {
+      // escape
+      starts.add((int) '\\');
+      ends.add((int) '\\');
+      next();
+    } else if (peek("d")) {
+      // digit: [0-9]
+      starts.add((int) '0');
+      ends.add((int) '9');
+      next();
+    } else if (peek("D")) {
+      // non-digit: [^0-9]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('0' - 1);
+      starts.add('9' + 1);
+      ends.add(Character.MAX_CODE_POINT);
+      next();
+    } else if (peek("s")) {
+      // whitespace: [\t-\n\r ]
+      starts.add((int) '\t');
+      ends.add((int) '\n');
+      starts.add((int) '\r');
+      ends.add((int) '\r');
+      starts.add((int) ' ');
+      ends.add((int) ' ');
+      next();
+    } else if (peek("S")) {
+      // non-whitespace: [^ \t\n\r]
+      starts.add(Character.MIN_CODE_POINT);
+      ends.add('\t' - 1);
+      starts.add('\n' + 1);

Review Comment:
   Yeah, let me fix the comment so this makes better sense. I swear I thought I
did it, but apparently not.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Add Automata.makeCharSet/makeCharClass to optimize regexp [lucene]

Reply via email to