rmuir commented on code in PR #14192: URL: https://github.com/apache/lucene/pull/14192#discussion_r1940264096
########## lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java: ########## @@ -436,6 +478,160 @@ public enum Kind { */ @Deprecated public static final int DEPRECATED_COMPLEMENT = 0x10000; + /** + * See {@link #UNICODE_CASE_INSENSITIVE} for more details on the set of known unstable alternative + * casings + */ + static final Map<Integer, int[]> unstableUnicodeCharacters = + Map.ofEntries( + // these are the set of characters whose casing matches across multiple characters + entry(181, new int[] {924, 956}), + entry(924, new int[] {181, 956}), + entry(956, new int[] {181, 924}), + entry(304, new int[] {105, 73, 305}), + entry(105, new int[] {304, 73, 305}), + entry(73, new int[] {304, 105, 305}), + entry(305, new int[] {304, 105, 73}), + entry(383, new int[] {83, 115}), + entry(83, new int[] {383, 115}), + entry(115, new int[] {383, 83}), + entry(837, new int[] {921, 953, 8126}), + entry(921, new int[] {837, 953, 8126}), + entry(953, new int[] {837, 921, 8126}), + entry(8126, new int[] {837, 921, 953}), + entry(962, new int[] {931, 963}), + entry(931, new int[] {962, 963}), + entry(963, new int[] {962, 931}), + entry(976, new int[] {914, 946}), + entry(914, new int[] {976, 946}), + entry(946, new int[] {976, 914}), + entry(977, new int[] {920, 952, 1012}), + entry(920, new int[] {977, 952, 1012}), + entry(952, new int[] {977, 920, 1012}), + entry(1012, new int[] {977, 920, 952}), + entry(981, new int[] {934, 966}), + entry(934, new int[] {981, 966}), + entry(966, new int[] {981, 934}), + entry(982, new int[] {928, 960}), + entry(928, new int[] {982, 960}), + entry(960, new int[] {982, 928}), + entry(1008, new int[] {922, 954}), + entry(922, new int[] {1008, 954}), + entry(954, new int[] {1008, 922}), + entry(1009, new int[] {929, 961}), + entry(929, new int[] {1009, 961}), + entry(961, new int[] {1009, 929}), + entry(1013, new int[] {917, 949}), + entry(917, new int[] {1013, 949}), + entry(949, new int[] {1013, 917}), + entry(7296, new int[] {1042, 
1074}), + entry(1042, new int[] {7296, 1074}), + entry(1074, new int[] {7296, 1042}), + entry(7297, new int[] {1044, 1076}), + entry(1044, new int[] {7297, 1076}), + entry(1076, new int[] {7297, 1044}), + entry(7298, new int[] {1054, 1086}), + entry(1054, new int[] {7298, 1086}), + entry(1086, new int[] {7298, 1054}), + entry(7299, new int[] {1057, 1089}), + entry(1057, new int[] {7299, 1089}), + entry(1089, new int[] {7299, 1057}), + entry(7300, new int[] {1058, 1090, 7301}), + entry(1058, new int[] {7300, 1090, 7301}), + entry(1090, new int[] {7300, 1058, 7301}), + entry(7301, new int[] {7300, 1058, 1090}), + entry(7302, new int[] {1066, 1098}), + entry(1066, new int[] {7302, 1098}), + entry(1098, new int[] {7302, 1066}), + entry(7303, new int[] {1122, 1123}), + entry(1122, new int[] {7303, 1123}), + entry(1123, new int[] {7303, 1122}), + entry(7304, new int[] {42570, 42571}), + entry(42570, new int[] {7304, 42571}), + entry(42571, new int[] {7304, 42570}), + entry(7835, new int[] {7776, 7777}), + entry(7776, new int[] {7835, 7777}), + entry(7777, new int[] {7835, 7776}), + entry(8486, new int[] {969, 937}), + entry(969, new int[] {8486, 937}), + entry(937, new int[] {8486, 969}), + entry(8490, new int[] {107, 75}), + entry(107, new int[] {8490, 75}), + entry(75, new int[] {8490, 107}), + entry(8491, new int[] {229, 197}), + entry(229, new int[] {8491, 197}), + entry(197, new int[] {8491, 229}), Review Comment: Yeah, I'm less concerned about what Pattern is doing. I am happy you have a solution, though. I don't really want to look at Pattern, and given that Java STILL doesn't expose case folding, I'm guessing I'd be sad if I looked. From a Unicode perspective, I think this is where we want to move towards? https://unicode.org/reports/tr18/#Simple_Loose_Matches So if the answer is that this special set consists of the characters listed in https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt, that will solve my problem. 
Then we have a way to keep everything up to date and understand what all the exceptions are for. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org