rmuir commented on code in PR #14193: URL: https://github.com/apache/lucene/pull/14193#discussion_r1943654993
########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -1195,60 +1215,132 @@ final RegExp parseCharClassExp() throws IllegalArgumentException { } final RegExp parseCharClasses() throws IllegalArgumentException { - RegExp e = parseCharClass(); - while (more() && !peek("]")) e = makeUnion(flags, e, parseCharClass()); - return e; - } - - final RegExp parseCharClass() throws IllegalArgumentException { - RegExp predefinedExp = matchPredefinedCharacterClass(); - if (predefinedExp != null) { - return predefinedExp; - } - - int c = parseCharExp(); - if (match('-')) return makeCharRange(flags, c, parseCharExp()); - else return makeChar(flags, c); - } - - RegExp expandPredefined() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - switch (from) { - case 'd': - return new RegExp("[0-9]"); // digit - case 'D': - return new RegExp("[^0-9]"); // non-digit - case 's': - return new RegExp("[ \t\n\r]"); // whitespace - case 'S': - return new RegExp("[^\\s]"); // non-whitespace - case 'w': - return new RegExp("[a-zA-Z_0-9]"); // word - case 'W': - return new RegExp("[^\\w]"); // non-word - default: - throw new IllegalArgumentException("invalid character class " + from); - } - } + var starts = new ArrayList<Integer>(); + var ends = new ArrayList<Integer>(); - final RegExp matchPredefinedCharacterClass() { - // See https://docs.oracle.com/javase/tutorial/essential/regex/pre_char_classes.html - if (match('\\')) { - if (peek("dDwWsS")) { - return newLeafNode(flags, Kind.REGEXP_PRE_CLASS, null, 0, 0, 0, 0, next(), 0); + do { + // look for escape + if (match('\\')) { + expandPreDefined(starts, ends); + } else { + // parse a character + int c = parseCharExp(); + + if (match('-')) { + // range from c-d + starts.add(c); + ends.add(parseCharExp()); + } else if (check(ASCII_CASE_INSENSITIVE)) { + // single case-insensitive character + for (int form : toCaseInsensitiveChar(c)) { + starts.add(form); + ends.add(form); + 
} + } else { + // single character + starts.add(c); + ends.add(c); + } } + } while (more() && !peek("]")); - if (peek("\\")) { - return makeChar(flags, next()); + // not sure why we bother optimizing nodes, same automaton... + // definitely saves time vs fixing toString()-based tests. + if (starts.size() == 1) { + if (starts.get(0) == ends.get(0)) { + return makeChar(flags, starts.get(0)); + } else { + return makeCharRange(flags, starts.get(0), ends.get(0)); } + } else { + return makeCharClass( + flags, + starts.stream().mapToInt(Integer::intValue).toArray(), + ends.stream().mapToInt(Integer::intValue).toArray()); + } + } + void expandPreDefined(List<Integer> starts, List<Integer> ends) { + if (peek("\\")) { + // escape + starts.add((int) '\\'); + ends.add((int) '\\'); + next(); + } else if (peek("d")) { + // digit: [0-9] + starts.add((int) '0'); + ends.add((int) '9'); + next(); + } else if (peek("D")) { + // non-digit: [^0-9] + starts.add(Character.MIN_CODE_POINT); + ends.add('0' - 1); + starts.add('9' + 1); + ends.add(Character.MAX_CODE_POINT); + next(); + } else if (peek("s")) { + // whitespace: [\t-\n\r ] + starts.add((int) '\t'); + ends.add((int) '\n'); + starts.add((int) '\r'); + ends.add((int) '\r'); + starts.add((int) ' '); + ends.add((int) ' '); + next(); + } else if (peek("S")) { + // non-whitespace: [^ \t\n\r] + starts.add(Character.MIN_CODE_POINT); + ends.add('\t' - 1); + starts.add('\n' + 1); Review Comment: Yeah lemme fix the comment, so this makes better sense. I swear I thought I did it, but apparently not -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org