john-wagster commented on code in PR #14192: URL: https://github.com/apache/lucene/pull/14192#discussion_r1940026321
########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -436,6 +478,160 @@ public enum Kind { */ @Deprecated public static final int DEPRECATED_COMPLEMENT = 0x10000; + /** + * See {@link #UNICODE_CASE_INSENSITIVE} for more details on the set of known unstable alternative + * casings + */ + static final Map<Integer, int[]> unstableUnicodeCharacters = + Map.ofEntries( + // these are the set of characters whose casing matches across multiple characters + entry(181, new int[] {924, 956}), + entry(924, new int[] {181, 956}), + entry(956, new int[] {181, 924}), + entry(304, new int[] {105, 73, 305}), + entry(105, new int[] {304, 73, 305}), + entry(73, new int[] {304, 105, 305}), + entry(305, new int[] {304, 105, 73}), + entry(383, new int[] {83, 115}), + entry(83, new int[] {383, 115}), + entry(115, new int[] {383, 83}), + entry(837, new int[] {921, 953, 8126}), + entry(921, new int[] {837, 953, 8126}), + entry(953, new int[] {837, 921, 8126}), + entry(8126, new int[] {837, 921, 953}), + entry(962, new int[] {931, 963}), + entry(931, new int[] {962, 963}), + entry(963, new int[] {962, 931}), + entry(976, new int[] {914, 946}), + entry(914, new int[] {976, 946}), + entry(946, new int[] {976, 914}), + entry(977, new int[] {920, 952, 1012}), + entry(920, new int[] {977, 952, 1012}), + entry(952, new int[] {977, 920, 1012}), + entry(1012, new int[] {977, 920, 952}), + entry(981, new int[] {934, 966}), + entry(934, new int[] {981, 966}), + entry(966, new int[] {981, 934}), + entry(982, new int[] {928, 960}), + entry(928, new int[] {982, 960}), + entry(960, new int[] {982, 928}), + entry(1008, new int[] {922, 954}), + entry(922, new int[] {1008, 954}), + entry(954, new int[] {1008, 922}), + entry(1009, new int[] {929, 961}), + entry(929, new int[] {1009, 961}), + entry(961, new int[] {1009, 929}), + entry(1013, new int[] {917, 949}), + entry(917, new int[] {1013, 949}), + entry(949, new int[] {1013, 917}), + entry(7296, new int[] {1042, 
1074}), + entry(1042, new int[] {7296, 1074}), + entry(1074, new int[] {7296, 1042}), + entry(7297, new int[] {1044, 1076}), + entry(1044, new int[] {7297, 1076}), + entry(1076, new int[] {7297, 1044}), + entry(7298, new int[] {1054, 1086}), + entry(1054, new int[] {7298, 1086}), + entry(1086, new int[] {7298, 1054}), + entry(7299, new int[] {1057, 1089}), + entry(1057, new int[] {7299, 1089}), + entry(1089, new int[] {7299, 1057}), + entry(7300, new int[] {1058, 1090, 7301}), + entry(1058, new int[] {7300, 1090, 7301}), + entry(1090, new int[] {7300, 1058, 7301}), + entry(7301, new int[] {7300, 1058, 1090}), + entry(7302, new int[] {1066, 1098}), + entry(1066, new int[] {7302, 1098}), + entry(1098, new int[] {7302, 1066}), + entry(7303, new int[] {1122, 1123}), + entry(1122, new int[] {7303, 1123}), + entry(1123, new int[] {7303, 1122}), + entry(7304, new int[] {42570, 42571}), + entry(42570, new int[] {7304, 42571}), + entry(42571, new int[] {7304, 42570}), + entry(7835, new int[] {7776, 7777}), + entry(7776, new int[] {7835, 7777}), + entry(7777, new int[] {7835, 7776}), + entry(8486, new int[] {969, 937}), + entry(969, new int[] {8486, 937}), + entry(937, new int[] {8486, 969}), + entry(8490, new int[] {107, 75}), + entry(107, new int[] {8490, 75}), + entry(75, new int[] {8490, 107}), + entry(8491, new int[] {229, 197}), + entry(229, new int[] {8491, 197}), + entry(197, new int[] {8491, 229}), Review Comment: Good suggestion; let me poke around a bit more and see if I can do that. My intention was to follow Pattern, not the Unicode spec (and let Pattern follow some version of the spec), but it would be nice if all of these characters fell into obvious sets rather than relying on this hardcoded mapping. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org