https://github.com/python/cpython/commit/56c4f10d6e474604a162521228b5f3b5ff79236c
commit: 56c4f10d6e474604a162521228b5f3b5ff79236c
branch: main
author: Stan Ulbrych <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-02-24T19:42:33+02:00
summary:
gh-88091: Fix unicodedata.decomposition() for Hangul Syllables (GH-144993)
files:
A Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst
M Lib/test/test_unicodedata.py
M Modules/unicodedata.c
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 8d4ba677faaa6f..30a26751d3ac54 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -378,6 +378,12 @@ def test_decomposition(self):
# New in 17.0.0
self.assertEqual(self.db.decomposition('\uA7F1'), '' if self.old else
'<super> 0053')
+ # Hangul characters
+ self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161')
+ self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6')
+ self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161')
+ self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2')
+
self.assertRaises(TypeError, self.db.decomposition)
self.assertRaises(TypeError, self.db.decomposition, 'xx')
@@ -687,9 +693,9 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
- expectedchecksum = ('668dbbea1136e69d4f00677a5988b23bc78aefc6'
+ expectedchecksum = ('00b13fa975a60b1d3f490f1fc8c126ab24990c75'
if quicktest else
- 'b869af769bd8fe352c04622ab90533dc54df5cf3')
+ 'ebfc9dd281c2226998fd435744dd2e9321899beb')
@requires_resource('network')
def test_all_names(self):
@@ -1068,9 +1074,9 @@ def test_block_invalid_input(self):
class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
db = unicodedata.ucd_3_2_0
old = True
- expectedchecksum = ('2164a66700e03cba9c9f5ed9e9a8d594d2da136a'
+ expectedchecksum = ('cb5bbbd1f55b67371e18222b90a8e21c87f16b72'
if quicktest else
- 'a8276cec9b6991779c5bdaa46c1ae7cc50bc2403')
+ '74936dffe949d99203a47e6a66565b2fc337bae7')
class UnicodeMiscTest(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst
new file mode 100644
index 00000000000000..15cf25052bbb46
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst
@@ -0,0 +1 @@
+Fix :func:`unicodedata.decomposition` for Hangul characters.
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 27bdd19c409471..401f64e7416944 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -429,6 +429,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}
+// For Hangul decomposition
+#define SBase 0xAC00
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define NCount (VCount*TCount)
+#define SCount (LCount*NCount)
+
/*[clinic input]
@permit_long_summary
unicodedata.UCD.decomposition
@@ -460,6 +471,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */
}
+ // Hangul Decomposition.
+ // See section 3.12.2, "Hangul Syllable Decomposition"
+ // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
+ if (SBase <= code && code < (SBase + SCount)) {
+ int SIndex = code - SBase;
+ int L = LBase + SIndex / NCount;
+ int V = VBase + (SIndex % NCount) / TCount;
+ int T = TBase + SIndex % TCount;
+ if (T != TBase) {
+ PyOS_snprintf(decomp, sizeof(decomp),
+ "%04X %04X %04X", L, V, T);
+ }
+ else {
+ PyOS_snprintf(decomp, sizeof(decomp),
+ "%04X %04X", L, V);
+ }
+ return PyUnicode_FromString(decomp);
+ }
+
if (code < 0 || code >= 0x110000)
index = 0;
else {
@@ -522,16 +552,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
(*index)++;
}
-#define SBase 0xAC00
-#define LBase 0x1100
-#define VBase 0x1161
-#define TBase 0x11A7
-#define LCount 19
-#define VCount 21
-#define TCount 28
-#define NCount (VCount*TCount)
-#define SCount (LCount*NCount)
-
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
@@ -585,7 +605,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
}
output = new_output;
}
- /* Hangul Decomposition. */
+ // Hangul Decomposition.
+ // See section 3.12.2, "Hangul Syllable Decomposition"
+ // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669
if (SBase <= code && code < (SBase+SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]