john-wagster commented on code in PR #14192: URL: https://github.com/apache/lucene/pull/14192#discussion_r1939889352
########## lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java: ########## @@ -35,6 +43,320 @@ public void testSmoke() { assertFalse(run.run("ad")); } + public void testUnicodeAsciiInsensitiveFlags() { + RegExp r; + // ASCII behaves appropriately with different flags + r = new RegExp("A"); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("a")); + + r = new RegExp("A", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a")); + + r = new RegExp("A", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a")); + + r = + new RegExp( + "A", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a")); + + // class 1 Unicode characters behaves appropriately with different flags + r = new RegExp("Σ"); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("σ")); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = new RegExp("σ"); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = new RegExp("Σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("σ")); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = new RegExp("σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE); + assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = new RegExp("Σ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("σ")); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = new RegExp("σ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = + new RegExp( + "Σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new 
CharacterRunAutomaton(r.toAutomaton()).run("σ")); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + r = + new RegExp( + "σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς")); + + // class 2 Unicode characters behaves appropriately with different flags + r = new RegExp("ῼ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ῳ")); + + r = new RegExp("ῼ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertFalse( + new CharacterRunAutomaton(r.toAutomaton()).run("ῼ".toUpperCase(Locale.ROOT))); // "ΩΙ" + + // class 3 Unicode characters behaves appropriately with different flags + r = new RegExp("ﬗ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + assertFalse( + new CharacterRunAutomaton(r.toAutomaton()).run("ﬗ".toUpperCase(Locale.ROOT))); // "ՄԽ" + } + + public void testUnicodeInsensitiveMatchPatternParity() { + // this ensures that if the Pattern class behavior were to change with a change to the Unicode + // spec then we would pick it up + // except new characters that were introduced (which would be a manual process; see tooling + // comments below) + for (Map.Entry<Integer, int[]> entry : RegExp.unstableUnicodeCharacters.entrySet()) { + int codePoint = entry.getKey(); + int[] caseInsensititiveAlternatives = entry.getValue(); + String pattern = new String(Character.toChars(codePoint)); + Pattern javaRegex = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + RegExp r = new RegExp(pattern, RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + CharacterRunAutomaton cra = new CharacterRunAutomaton(r.toAutomaton()); + for (int i = 0; i < caseInsensititiveAlternatives.length; i++) { + + int alt = caseInsensititiveAlternatives[i]; + String altString = new String(Character.toChars(alt)); + + assertTrue(javaRegex.matcher(altString).matches()); + assertTrue(cra.run(altString)); + } + } 
+ + // tooling code to validate manually to discover new characters that fall into the "unstable" + // set of Unicode characters + // variations of these can be used to discover the various classes themselves + // generateClass1(); + // generateClass2(); + // generateClass3(); + } + + public void testRandomUnicodeInsensitiveMatchPatternParity() { + int maxIters = 1000; + List<Integer> reservedCharacters = + Set.of( + '.', '^', '$', '*', '+', '?', '(', ')', '[', '{', '\\', '|', '-', '"', '<', '>', + '#', '@', '&', '~') + .stream() + .map(c -> (int) c) + .toList(); + for (int i = 0; i < maxIters; i++) { + int nextCode1 = random().nextInt(0, Character.MAX_CODE_POINT + 1); + int nextCode2 = random().nextInt(0, Character.MAX_CODE_POINT + 1); + + // skip if we select a reserved character that blows up .^$*+?()[{\|-]"< + if (reservedCharacters.contains(nextCode1)) { + continue; + } + + String pattern = new String(Character.toChars(nextCode1)); + String altString = new String(Character.toChars(nextCode2)); + + Pattern javaRegex = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE); + RegExp r = new RegExp(pattern, RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE); + CharacterRunAutomaton cra = new CharacterRunAutomaton(r.toAutomaton()); + assertEquals( + "Pattern and RegExp disagree on pattern: " + nextCode1 + " :text: " + nextCode2, + javaRegex.matcher(altString).matches(), + cra.run(altString)); + } + } + + public static void generateClass1() { Review Comment: I'm not sure where these utility functions should live. They are useful for validating and regenerating the relevant code points, so I'm putting them here for now. I'm open to suggestions on whether they belong in the codebase and, if so, where. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org