Re: [PR] Unicode Support for Case Insensitive Matching in RegExp [lucene]

via GitHub Mon, 03 Feb 2025 11:07:01 -0800


john-wagster commented on code in PR #14192:
URL: https://github.com/apache/lucene/pull/14192#discussion_r1939889352



##########
lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java:
##########
@@ -35,6 +43,320 @@ public void testSmoke() {
     assertFalse(run.run("ad"));
   }
 
+  public void testUnicodeAsciiInsensitiveFlags() {
+    RegExp r;
+    // ASCII behaves appropriately with different flags
+    r = new RegExp("A");
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("a"));
+
+    r = new RegExp("A", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a"));
+
+    r = new RegExp("A", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a"));
+
+    r =
+        new RegExp(
+            "A", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | 
RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("a"));
+
+    // class 1 Unicode characters behave appropriately with different flags
+    r = new RegExp("Σ");
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("σ"));
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r = new RegExp("σ");
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r = new RegExp("Σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE);
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("σ"));
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r = new RegExp("σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE);
+    assertFalse(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r = new RegExp("Σ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("σ"));
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r = new RegExp("σ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r =
+        new RegExp(
+            "Σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | 
RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("σ"));
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    r =
+        new RegExp(
+            "σ", RegExp.ALL, RegExp.ASCII_CASE_INSENSITIVE | 
RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ς"));
+
+    // class 2 Unicode characters behave appropriately with different flags
+    r = new RegExp("ῼ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertTrue(new CharacterRunAutomaton(r.toAutomaton()).run("ῳ"));
+
+    r = new RegExp("ῼ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertFalse(
+        new 
CharacterRunAutomaton(r.toAutomaton()).run("ῼ".toUpperCase(Locale.ROOT))); // 
"ΩΙ"
+
+    // class 3 Unicode characters behave appropriately with different flags
+    r = new RegExp("ﬗ", RegExp.ALL, RegExp.UNICODE_CASE_INSENSITIVE);
+    assertFalse(
+        new 
CharacterRunAutomaton(r.toAutomaton()).run("ﬗ".toUpperCase(Locale.ROOT))); // 
"ՄԽ"
+  }
+
+  public void testUnicodeInsensitiveMatchPatternParity() {
+    // this ensures that if the Pattern class behavior were to change with a change to the Unicode
+    // spec then we would pick it up,
+    // except for new characters that were introduced (which would be a manual process; see tooling
+    // comments below)
+    for (Map.Entry<Integer, int[]> entry : 
RegExp.unstableUnicodeCharacters.entrySet()) {
+      int codePoint = entry.getKey();
+      int[] caseInsensititiveAlternatives = entry.getValue();
+      String pattern = new String(Character.toChars(codePoint));
+      Pattern javaRegex = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | 
Pattern.UNICODE_CASE);
+      RegExp r = new RegExp(pattern, RegExp.ALL, 
RegExp.UNICODE_CASE_INSENSITIVE);
+      CharacterRunAutomaton cra = new CharacterRunAutomaton(r.toAutomaton());
+      for (int i = 0; i < caseInsensititiveAlternatives.length; i++) {
+
+        int alt = caseInsensititiveAlternatives[i];
+        String altString = new String(Character.toChars(alt));
+
+        assertTrue(javaRegex.matcher(altString).matches());
+        assertTrue(cra.run(altString));
+      }
+    }
+
+    // tooling code to run manually to discover new characters that fall into the "unstable"
+    // set of Unicode characters;
+    // variations of these can be used to discover the various classes themselves
+    // generateClass1();
+    // generateClass2();
+    // generateClass3();
+  }
+
+  public void testRandomUnicodeInsensitiveMatchPatternParity() {
+    int maxIters = 1000;
+    List<Integer> reservedCharacters =
+        Set.of(
+                '.', '^', '$', '*', '+', '?', '(', ')', '[', '{', '\\', '|', 
'-', '"', '<', '>',
+                '#', '@', '&', '~')
+            .stream()
+            .map(c -> (int) c)
+            .toList();
+    for (int i = 0; i < maxIters; i++) {
+      int nextCode1 = random().nextInt(0, Character.MAX_CODE_POINT + 1);
+      int nextCode2 = random().nextInt(0, Character.MAX_CODE_POINT + 1);
+
+      // skip if we select a reserved character that blows up .^$*+?()[{\|-]"<
+      if (reservedCharacters.contains(nextCode1)) {
+        continue;
+      }
+
+      String pattern = new String(Character.toChars(nextCode1));
+      String altString = new String(Character.toChars(nextCode2));
+
+      Pattern javaRegex = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE | 
Pattern.UNICODE_CASE);
+      RegExp r = new RegExp(pattern, RegExp.ALL, 
RegExp.UNICODE_CASE_INSENSITIVE);
+      CharacterRunAutomaton cra = new CharacterRunAutomaton(r.toAutomaton());
+      assertEquals(
+          "Pattern and RegExp disagree on pattern: " + nextCode1 + " :text: " 
+ nextCode2,
+          javaRegex.matcher(altString).matches(),
+          cra.run(altString));
+    }
+  }
+
+  public static void generateClass1() {

Review Comment:
   I'm not sure where these utility functions should live. They are useful for 
validating and regenerating the relevant code points, so I'm putting them here 
for now, but I'm open to suggestions on whether and where they belong.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Unicode Support for Case Insensitive Matching in RegExp [lucene]

Reply via email to