https://github.com/python/cpython/commit/f1f61bf87207c27da06ff73611b76933e456ef18
commit: f1f61bf87207c27da06ff73611b76933e456ef18
branch: main
author: Stan Ulbrych <[email protected]>
committer: malemburg <[email protected]>
date: 2026-02-21T18:27:55+01:00
summary:

gh-66802: Add `unicodedata.block()` function (#145042)

Closes #66802

files:
A Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst
M Doc/library/unicodedata.rst
M Doc/whatsnew/3.15.rst
M Lib/test/test_unicodedata.py
M Modules/clinic/unicodedata.c.h
M Modules/unicodedata.c
M Modules/unicodedata_db.h
M Tools/unicode/makeunicodedata.py

diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
index 2fc8b1d8b52341..d5f0405efbecc6 100644
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -130,6 +130,18 @@ following functions:
    `Unicode Standard Annex #11 <https://www.unicode.org/reports/tr11/>`_.
 
 
+.. function:: block(chr, /)
+
+   Returns the `block
+   <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
+   assigned to the character *chr*. For example::
+
+      >>> unicodedata.block('S')
+      'Basic Latin'
+
+   .. versionadded:: next
+
+
 .. function:: mirrored(chr, /)
 
    Returns the mirrored property assigned to the character *chr* as
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst
index fa3ba25a954e40..cd1ec0e5c452d3 100644
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -1134,6 +1134,11 @@ unicodedata
   of the character which are related to the above algorithm.
   (Contributed by Serhiy Storchaka and Guillaume Sanchez in :gh:`74902`.)
 
+* Add :func:`~unicodedata.block` function to return the `Unicode block
+  <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_
+  assigned to a character.
+  (Contributed by Stan Ulbrych in :gh:`66802`.)
+
 
 unittest
 --------
diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py
index 1d03e7d9fec717..8d4ba677faaa6f 100644
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -973,6 +973,97 @@ def graphemes(*args):
             'a\U0001F1FA\U0001F1E6\U0001F1FA\U0001F1F3'),
             ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'])
 
+    def test_block(self):
+        self.assertEqual(self.db.block('\u0000'), 'Basic Latin')
+        self.assertEqual(self.db.block('\u0041'), 'Basic Latin')
+        self.assertEqual(self.db.block('\u007F'), 'Basic Latin')
+        self.assertEqual(self.db.block('\u0080'), 'Latin-1 Supplement')
+        self.assertEqual(self.db.block('\u00FF'), 'Latin-1 Supplement')
+        self.assertEqual(self.db.block('\u1159'), 'Hangul Jamo')
+        self.assertEqual(self.db.block('\u11F9'), 'Hangul Jamo')
+        self.assertEqual(self.db.block('\uD788'), 'Hangul Syllables')
+        self.assertEqual(self.db.block('\uD7A3'), 'Hangul Syllables')
+        # New in 5.0.0
+        self.assertEqual(self.db.block('\u05BA'), 'Hebrew')
+        self.assertEqual(self.db.block('\u20EF'), 'Combining Diacritical Marks for Symbols')
+        # New in 5.1.0
+        self.assertEqual(self.db.block('\u2064'), 'General Punctuation')
+        self.assertEqual(self.db.block('\uAA4D'), 'Cham')
+        # New in 5.2.0
+        self.assertEqual(self.db.block('\u0816'), 'Samaritan')
+        self.assertEqual(self.db.block('\uA97C'), 'Hangul Jamo Extended-A')
+        self.assertEqual(self.db.block('\uD7C6'), 'Hangul Jamo Extended-B')
+        self.assertEqual(self.db.block('\uD7FB'), 'Hangul Jamo Extended-B')
+        # New in 6.0.0
+        self.assertEqual(self.db.block('\u093A'), 'Devanagari')
+        self.assertEqual(self.db.block('\U00011002'), 'Brahmi')
+        # New in 6.1.0
+        self.assertEqual(self.db.block('\U000E0FFF'), 'No_Block')
+        self.assertEqual(self.db.block('\U00016F7E'), 'Miao')
+        # New in 6.2.0
+        self.assertEqual(self.db.block('\U0001F1E6'), 'Enclosed Alphanumeric Supplement')
+        self.assertEqual(self.db.block('\U0001F1FF'), 'Enclosed Alphanumeric Supplement')
+        # New in 6.3.0
+        self.assertEqual(self.db.block('\u180E'), 'Mongolian')
+        self.assertEqual(self.db.block('\u1A1B'), 'Buginese')
+        # New in 7.0.0
+        self.assertEqual(self.db.block('\u0E33'), 'Thai')
+        self.assertEqual(self.db.block('\u0EB3'), 'Lao')
+        self.assertEqual(self.db.block('\U0001BCA3'), 'Shorthand Format Controls')
+        self.assertEqual(self.db.block('\U0001E8D6'), 'Mende Kikakui')
+        self.assertEqual(self.db.block('\U0001163E'), 'Modi')
+        # New in 8.0.0
+        self.assertEqual(self.db.block('\u08E3'), 'Arabic Extended-A')
+        self.assertEqual(self.db.block('\U00011726'), 'Ahom')
+        # New in 9.0.0
+        self.assertEqual(self.db.block('\u0600'), 'Arabic')
+        self.assertEqual(self.db.block('\U000E007F'), 'Tags')
+        self.assertEqual(self.db.block('\U00011CB4'), 'Marchen')
+        self.assertEqual(self.db.block('\u200D'), 'General Punctuation')
+        # New in 10.0.0
+        self.assertEqual(self.db.block('\U00011D46'), 'Masaram Gondi')
+        self.assertEqual(self.db.block('\U00011D47'), 'Masaram Gondi')
+        self.assertEqual(self.db.block('\U00011A97'), 'Soyombo')
+        # New in 11.0.0
+        self.assertEqual(self.db.block('\U000110CD'), 'Kaithi')
+        self.assertEqual(self.db.block('\u07FD'), 'NKo')
+        self.assertEqual(self.db.block('\U00011EF6'), 'Makasar')
+        # New in 12.0.0
+        self.assertEqual(self.db.block('\U00011A84'), 'Soyombo')
+        self.assertEqual(self.db.block('\U00013438'), 'Egyptian Hieroglyph Format Controls')
+        self.assertEqual(self.db.block('\U0001E2EF'), 'Wancho')
+        self.assertEqual(self.db.block('\U00016F87'), 'Miao')
+        # New in 13.0.0
+        self.assertEqual(self.db.block('\U00011941'), 'Dives Akuru')
+        self.assertEqual(self.db.block('\U00016FE4'), 'Ideographic Symbols and Punctuation')
+        self.assertEqual(self.db.block('\U00011942'), 'Dives Akuru')
+        # New in 14.0.0
+        self.assertEqual(self.db.block('\u0891'), 'Arabic Extended-B')
+        self.assertEqual(self.db.block('\U0001E2AE'), 'Toto')
+        # New in 15.0.0
+        self.assertEqual(self.db.block('\U00011F02'), 'Kawi')
+        self.assertEqual(self.db.block('\U0001343F'), 'Egyptian Hieroglyph Format Controls')
+        self.assertEqual(self.db.block('\U0001E4EF'), 'Nag Mundari')
+        self.assertEqual(self.db.block('\U00011F3F'), 'Kawi')
+        # New in 16.0.0
+        self.assertEqual(self.db.block('\U000113D1'), 'Tulu-Tigalari')
+        self.assertEqual(self.db.block('\U0001E5EF'), 'Ol Onal')
+        self.assertEqual(self.db.block('\U0001612C'), 'Gurung Khema')
+        self.assertEqual(self.db.block('\U00016D63'), 'Kirat Rai')
+        # New in 17.0.0
+        self.assertEqual(self.db.block('\u1AEB'), 'Combining Diacritical Marks Extended')
+        self.assertEqual(self.db.block('\U00011B67'), 'Sharada Supplement')
+        # Unassigned
+        self.assertEqual(self.db.block('\U00100000'), 'Supplementary Private Use Area-B')
+        self.assertEqual(self.db.block('\U0010FFFF'), 'Supplementary Private Use Area-B')
+
+    def test_block_invalid_input(self):
+        self.assertRaises(TypeError, self.db.block)
+        self.assertRaises(TypeError, self.db.block, b'x')
+        self.assertRaises(TypeError, self.db.block, 120)
+        self.assertRaises(TypeError, self.db.block, '')
+        self.assertRaises(TypeError, self.db.block, 'xx')
+
 
 class Unicode_3_2_0_FunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
     db = unicodedata.ucd_3_2_0
diff --git a/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst
new file mode 100644
index 00000000000000..68a25262c7d7f7
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-02-20-13-03-10.gh-issue-66802.OYcAi_.rst
@@ -0,0 +1,3 @@
+Add :func:`unicodedata.block` function to return the `Unicode block
+<https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189>`_ of a
+character.
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h
index 8e2dd7a0ce5663..5443893079b1af 100644
--- a/Modules/clinic/unicodedata.c.h
+++ b/Modules/clinic/unicodedata.c.h
@@ -691,6 +691,42 @@ unicodedata_iter_graphemes(PyObject *module, PyObject *const *args, Py_ssize_t n
     return return_value;
 }
 
+PyDoc_STRVAR(unicodedata_block__doc__,
+"block($module, chr, /)\n"
+"--\n"
+"\n"
+"Return block assigned to the character chr.");
+
+#define UNICODEDATA_BLOCK_METHODDEF    \
+    {"block", (PyCFunction)unicodedata_block, METH_O, unicodedata_block__doc__},
+
+static PyObject *
+unicodedata_block_impl(PyObject *module, int chr);
+
+static PyObject *
+unicodedata_block(PyObject *module, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    int chr;
+
+    if (!PyUnicode_Check(arg)) {
+        _PyArg_BadArgument("block", "argument", "a unicode character", arg);
+        goto exit;
+    }
+    if (PyUnicode_GET_LENGTH(arg) != 1) {
+        PyErr_Format(PyExc_TypeError,
+            "block(): argument must be a unicode character, "
+            "not a string of length %zd",
+            PyUnicode_GET_LENGTH(arg));
+        goto exit;
+    }
+    chr = PyUnicode_READ_CHAR(arg, 0);
+    return_value = unicodedata_block_impl(module, chr);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(unicodedata_grapheme_cluster_break__doc__,
 "grapheme_cluster_break($module, chr, /)\n"
 "--\n"
@@ -798,4 +834,4 @@ unicodedata_extended_pictographic(PyObject *module, PyObject *arg)
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=0f09cc90f06ace76 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=482a87df218f07c1 input=a9049054013a1b77]*/
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
index 1ed9760874b2a6..f20726a937ce38 100644
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -2066,6 +2066,39 @@ unicodedata_iter_graphemes_impl(PyObject *module, PyObject *unistr,
     return (PyObject*)gbi;
 }
 
+/*[clinic input]
+unicodedata.block
+
+    chr: int(accept={str})
+    /
+
+Return block assigned to the character chr.
+[clinic start generated code]*/
+
+static PyObject *
+unicodedata_block_impl(PyObject *module, int chr)
+/*[clinic end generated code: output=5f8b40c49eaec75a input=0834cf2642d6eaae]*/
+{
+    Py_UCS4 c = (Py_UCS4)chr;
+    int lo = 0, hi = BLOCK_COUNT - 1;
+    while (lo <= hi) {
+        int mid = (lo + hi) / 2;
+        if (c < _PyUnicode_Blocks[mid].start) {
+            hi = mid - 1;
+        }
+        else if (c > _PyUnicode_Blocks[mid].end) {
+            lo = mid + 1;
+        }
+        else {
+            size_t name = _PyUnicode_Blocks[mid].name;
+            return PyUnicode_FromString(_PyUnicode_BlockNames[name]);
+        }
+    }
+    // Otherwise, return the default value per
+    // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G64189
+    return PyUnicode_FromString("No_Block");
+}
+
 /*[clinic input]
 unicodedata.grapheme_cluster_break
 
@@ -2128,6 +2161,7 @@ unicodedata_extended_pictographic_impl(PyObject *module, int chr)
 // an UCD instance.
 static PyMethodDef unicodedata_functions[] = {
     // Module only functions.
+    UNICODEDATA_BLOCK_METHODDEF
     UNICODEDATA_GRAPHEME_CLUSTER_BREAK_METHODDEF
     UNICODEDATA_INDIC_CONJUNCT_BREAK_METHODDEF
     UNICODEDATA_EXTENDED_PICTOGRAPHIC_METHODDEF
@@ -2137,7 +2171,7 @@ static PyMethodDef unicodedata_functions[] = {
 
     // The following definitions are shared between the module
     // and the UCD class.
-#define DB_methods (unicodedata_functions + 6)
+#define DB_methods (unicodedata_functions + 7)
 
     UNICODEDATA_UCD_DECIMAL_METHODDEF
     UNICODEDATA_UCD_DIGIT_METHODDEF
diff --git a/Modules/unicodedata_db.h b/Modules/unicodedata_db.h
index 3cc5776a1f240d..9e88f5cca7115b 100644
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
@@ -796,6 +796,709 @@ const char * const _PyUnicode_IndicConjunctBreakNames[] = {
     "Extend",
     NULL
 };
+static const char * const _PyUnicode_BlockNames[] = {
+    "Basic Latin",
+    "Latin-1 Supplement",
+    "Latin Extended-A",
+    "Latin Extended-B",
+    "IPA Extensions",
+    "Spacing Modifier Letters",
+    "Combining Diacritical Marks",
+    "Greek and Coptic",
+    "Cyrillic",
+    "Cyrillic Supplement",
+    "Armenian",
+    "Hebrew",
+    "Arabic",
+    "Syriac",
+    "Arabic Supplement",
+    "Thaana",
+    "NKo",
+    "Samaritan",
+    "Mandaic",
+    "Syriac Supplement",
+    "Arabic Extended-B",
+    "Arabic Extended-A",
+    "Devanagari",
+    "Bengali",
+    "Gurmukhi",
+    "Gujarati",
+    "Oriya",
+    "Tamil",
+    "Telugu",
+    "Kannada",
+    "Malayalam",
+    "Sinhala",
+    "Thai",
+    "Lao",
+    "Tibetan",
+    "Myanmar",
+    "Georgian",
+    "Hangul Jamo",
+    "Ethiopic",
+    "Ethiopic Supplement",
+    "Cherokee",
+    "Unified Canadian Aboriginal Syllabics",
+    "Ogham",
+    "Runic",
+    "Tagalog",
+    "Hanunoo",
+    "Buhid",
+    "Tagbanwa",
+    "Khmer",
+    "Mongolian",
+    "Unified Canadian Aboriginal Syllabics Extended",
+    "Limbu",
+    "Tai Le",
+    "New Tai Lue",
+    "Khmer Symbols",
+    "Buginese",
+    "Tai Tham",
+    "Combining Diacritical Marks Extended",
+    "Balinese",
+    "Sundanese",
+    "Batak",
+    "Lepcha",
+    "Ol Chiki",
+    "Cyrillic Extended-C",
+    "Georgian Extended",
+    "Sundanese Supplement",
+    "Vedic Extensions",
+    "Phonetic Extensions",
+    "Phonetic Extensions Supplement",
+    "Combining Diacritical Marks Supplement",
+    "Latin Extended Additional",
+    "Greek Extended",
+    "General Punctuation",
+    "Superscripts and Subscripts",
+    "Currency Symbols",
+    "Combining Diacritical Marks for Symbols",
+    "Letterlike Symbols",
+    "Number Forms",
+    "Arrows",
+    "Mathematical Operators",
+    "Miscellaneous Technical",
+    "Control Pictures",
+    "Optical Character Recognition",
+    "Enclosed Alphanumerics",
+    "Box Drawing",
+    "Block Elements",
+    "Geometric Shapes",
+    "Miscellaneous Symbols",
+    "Dingbats",
+    "Miscellaneous Mathematical Symbols-A",
+    "Supplemental Arrows-A",
+    "Braille Patterns",
+    "Supplemental Arrows-B",
+    "Miscellaneous Mathematical Symbols-B",
+    "Supplemental Mathematical Operators",
+    "Miscellaneous Symbols and Arrows",
+    "Glagolitic",
+    "Latin Extended-C",
+    "Coptic",
+    "Georgian Supplement",
+    "Tifinagh",
+    "Ethiopic Extended",
+    "Cyrillic Extended-A",
+    "Supplemental Punctuation",
+    "CJK Radicals Supplement",
+    "Kangxi Radicals",
+    "Ideographic Description Characters",
+    "CJK Symbols and Punctuation",
+    "Hiragana",
+    "Katakana",
+    "Bopomofo",
+    "Hangul Compatibility Jamo",
+    "Kanbun",
+    "Bopomofo Extended",
+    "CJK Strokes",
+    "Katakana Phonetic Extensions",
+    "Enclosed CJK Letters and Months",
+    "CJK Compatibility",
+    "CJK Unified Ideographs Extension A",
+    "Yijing Hexagram Symbols",
+    "CJK Unified Ideographs",
+    "Yi Syllables",
+    "Yi Radicals",
+    "Lisu",
+    "Vai",
+    "Cyrillic Extended-B",
+    "Bamum",
+    "Modifier Tone Letters",
+    "Latin Extended-D",
+    "Syloti Nagri",
+    "Common Indic Number Forms",
+    "Phags-pa",
+    "Saurashtra",
+    "Devanagari Extended",
+    "Kayah Li",
+    "Rejang",
+    "Hangul Jamo Extended-A",
+    "Javanese",
+    "Myanmar Extended-B",
+    "Cham",
+    "Myanmar Extended-A",
+    "Tai Viet",
+    "Meetei Mayek Extensions",
+    "Ethiopic Extended-A",
+    "Latin Extended-E",
+    "Cherokee Supplement",
+    "Meetei Mayek",
+    "Hangul Syllables",
+    "Hangul Jamo Extended-B",
+    "High Surrogates",
+    "High Private Use Surrogates",
+    "Low Surrogates",
+    "Private Use Area",
+    "CJK Compatibility Ideographs",
+    "Alphabetic Presentation Forms",
+    "Arabic Presentation Forms-A",
+    "Variation Selectors",
+    "Vertical Forms",
+    "Combining Half Marks",
+    "CJK Compatibility Forms",
+    "Small Form Variants",
+    "Arabic Presentation Forms-B",
+    "Halfwidth and Fullwidth Forms",
+    "Specials",
+    "Linear B Syllabary",
+    "Linear B Ideograms",
+    "Aegean Numbers",
+    "Ancient Greek Numbers",
+    "Ancient Symbols",
+    "Phaistos Disc",
+    "Lycian",
+    "Carian",
+    "Coptic Epact Numbers",
+    "Old Italic",
+    "Gothic",
+    "Old Permic",
+    "Ugaritic",
+    "Old Persian",
+    "Deseret",
+    "Shavian",
+    "Osmanya",
+    "Osage",
+    "Elbasan",
+    "Caucasian Albanian",
+    "Vithkuqi",
+    "Todhri",
+    "Linear A",
+    "Latin Extended-F",
+    "Cypriot Syllabary",
+    "Imperial Aramaic",
+    "Palmyrene",
+    "Nabataean",
+    "Hatran",
+    "Phoenician",
+    "Lydian",
+    "Sidetic",
+    "Meroitic Hieroglyphs",
+    "Meroitic Cursive",
+    "Kharoshthi",
+    "Old South Arabian",
+    "Old North Arabian",
+    "Manichaean",
+    "Avestan",
+    "Inscriptional Parthian",
+    "Inscriptional Pahlavi",
+    "Psalter Pahlavi",
+    "Old Turkic",
+    "Old Hungarian",
+    "Hanifi Rohingya",
+    "Garay",
+    "Rumi Numeral Symbols",
+    "Yezidi",
+    "Arabic Extended-C",
+    "Old Sogdian",
+    "Sogdian",
+    "Old Uyghur",
+    "Chorasmian",
+    "Elymaic",
+    "Brahmi",
+    "Kaithi",
+    "Sora Sompeng",
+    "Chakma",
+    "Mahajani",
+    "Sharada",
+    "Sinhala Archaic Numbers",
+    "Khojki",
+    "Multani",
+    "Khudawadi",
+    "Grantha",
+    "Tulu-Tigalari",
+    "Newa",
+    "Tirhuta",
+    "Siddham",
+    "Modi",
+    "Mongolian Supplement",
+    "Takri",
+    "Myanmar Extended-C",
+    "Ahom",
+    "Dogra",
+    "Warang Citi",
+    "Dives Akuru",
+    "Nandinagari",
+    "Zanabazar Square",
+    "Soyombo",
+    "Unified Canadian Aboriginal Syllabics Extended-A",
+    "Pau Cin Hau",
+    "Devanagari Extended-A",
+    "Sharada Supplement",
+    "Sunuwar",
+    "Bhaiksuki",
+    "Marchen",
+    "Masaram Gondi",
+    "Gunjala Gondi",
+    "Tolong Siki",
+    "Makasar",
+    "Kawi",
+    "Lisu Supplement",
+    "Tamil Supplement",
+    "Cuneiform",
+    "Cuneiform Numbers and Punctuation",
+    "Early Dynastic Cuneiform",
+    "Cypro-Minoan",
+    "Egyptian Hieroglyphs",
+    "Egyptian Hieroglyph Format Controls",
+    "Egyptian Hieroglyphs Extended-A",
+    "Anatolian Hieroglyphs",
+    "Gurung Khema",
+    "Bamum Supplement",
+    "Mro",
+    "Tangsa",
+    "Bassa Vah",
+    "Pahawh Hmong",
+    "Kirat Rai",
+    "Medefaidrin",
+    "Beria Erfe",
+    "Miao",
+    "Ideographic Symbols and Punctuation",
+    "Tangut",
+    "Tangut Components",
+    "Khitan Small Script",
+    "Tangut Supplement",
+    "Tangut Components Supplement",
+    "Kana Extended-B",
+    "Kana Supplement",
+    "Kana Extended-A",
+    "Small Kana Extension",
+    "Nushu",
+    "Duployan",
+    "Shorthand Format Controls",
+    "Symbols for Legacy Computing Supplement",
+    "Miscellaneous Symbols Supplement",
+    "Znamenny Musical Notation",
+    "Byzantine Musical Symbols",
+    "Musical Symbols",
+    "Ancient Greek Musical Notation",
+    "Kaktovik Numerals",
+    "Mayan Numerals",
+    "Tai Xuan Jing Symbols",
+    "Counting Rod Numerals",
+    "Mathematical Alphanumeric Symbols",
+    "Sutton SignWriting",
+    "Latin Extended-G",
+    "Glagolitic Supplement",
+    "Cyrillic Extended-D",
+    "Nyiakeng Puachue Hmong",
+    "Toto",
+    "Wancho",
+    "Nag Mundari",
+    "Ol Onal",
+    "Tai Yo",
+    "Ethiopic Extended-B",
+    "Mende Kikakui",
+    "Adlam",
+    "Indic Siyaq Numbers",
+    "Ottoman Siyaq Numbers",
+    "Arabic Mathematical Alphabetic Symbols",
+    "Mahjong Tiles",
+    "Domino Tiles",
+    "Playing Cards",
+    "Enclosed Alphanumeric Supplement",
+    "Enclosed Ideographic Supplement",
+    "Miscellaneous Symbols and Pictographs",
+    "Emoticons",
+    "Ornamental Dingbats",
+    "Transport and Map Symbols",
+    "Alchemical Symbols",
+    "Geometric Shapes Extended",
+    "Supplemental Arrows-C",
+    "Supplemental Symbols and Pictographs",
+    "Chess Symbols",
+    "Symbols and Pictographs Extended-A",
+    "Symbols for Legacy Computing",
+    "CJK Unified Ideographs Extension B",
+    "CJK Unified Ideographs Extension C",
+    "CJK Unified Ideographs Extension D",
+    "CJK Unified Ideographs Extension E",
+    "CJK Unified Ideographs Extension F",
+    "CJK Unified Ideographs Extension I",
+    "CJK Compatibility Ideographs Supplement",
+    "CJK Unified Ideographs Extension G",
+    "CJK Unified Ideographs Extension H",
+    "CJK Unified Ideographs Extension J",
+    "Tags",
+    "Variation Selectors Supplement",
+    "Supplementary Private Use Area-A",
+    "Supplementary Private Use Area-B",
+};
+typedef struct {
+    Py_UCS4 start;
+    Py_UCS4 end;
+    unsigned short name;
+} _PyUnicode_Block;
+static const _PyUnicode_Block _PyUnicode_Blocks[] = {
+    {0x0000, 0x007F, 0},
+    {0x0080, 0x00FF, 1},
+    {0x0100, 0x017F, 2},
+    {0x0180, 0x024F, 3},
+    {0x0250, 0x02AF, 4},
+    {0x02B0, 0x02FF, 5},
+    {0x0300, 0x036F, 6},
+    {0x0370, 0x03FF, 7},
+    {0x0400, 0x04FF, 8},
+    {0x0500, 0x052F, 9},
+    {0x0530, 0x058F, 10},
+    {0x0590, 0x05FF, 11},
+    {0x0600, 0x06FF, 12},
+    {0x0700, 0x074F, 13},
+    {0x0750, 0x077F, 14},
+    {0x0780, 0x07BF, 15},
+    {0x07C0, 0x07FF, 16},
+    {0x0800, 0x083F, 17},
+    {0x0840, 0x085F, 18},
+    {0x0860, 0x086F, 19},
+    {0x0870, 0x089F, 20},
+    {0x08A0, 0x08FF, 21},
+    {0x0900, 0x097F, 22},
+    {0x0980, 0x09FF, 23},
+    {0x0A00, 0x0A7F, 24},
+    {0x0A80, 0x0AFF, 25},
+    {0x0B00, 0x0B7F, 26},
+    {0x0B80, 0x0BFF, 27},
+    {0x0C00, 0x0C7F, 28},
+    {0x0C80, 0x0CFF, 29},
+    {0x0D00, 0x0D7F, 30},
+    {0x0D80, 0x0DFF, 31},
+    {0x0E00, 0x0E7F, 32},
+    {0x0E80, 0x0EFF, 33},
+    {0x0F00, 0x0FFF, 34},
+    {0x1000, 0x109F, 35},
+    {0x10A0, 0x10FF, 36},
+    {0x1100, 0x11FF, 37},
+    {0x1200, 0x137F, 38},
+    {0x1380, 0x139F, 39},
+    {0x13A0, 0x13FF, 40},
+    {0x1400, 0x167F, 41},
+    {0x1680, 0x169F, 42},
+    {0x16A0, 0x16FF, 43},
+    {0x1700, 0x171F, 44},
+    {0x1720, 0x173F, 45},
+    {0x1740, 0x175F, 46},
+    {0x1760, 0x177F, 47},
+    {0x1780, 0x17FF, 48},
+    {0x1800, 0x18AF, 49},
+    {0x18B0, 0x18FF, 50},
+    {0x1900, 0x194F, 51},
+    {0x1950, 0x197F, 52},
+    {0x1980, 0x19DF, 53},
+    {0x19E0, 0x19FF, 54},
+    {0x1A00, 0x1A1F, 55},
+    {0x1A20, 0x1AAF, 56},
+    {0x1AB0, 0x1AFF, 57},
+    {0x1B00, 0x1B7F, 58},
+    {0x1B80, 0x1BBF, 59},
+    {0x1BC0, 0x1BFF, 60},
+    {0x1C00, 0x1C4F, 61},
+    {0x1C50, 0x1C7F, 62},
+    {0x1C80, 0x1C8F, 63},
+    {0x1C90, 0x1CBF, 64},
+    {0x1CC0, 0x1CCF, 65},
+    {0x1CD0, 0x1CFF, 66},
+    {0x1D00, 0x1D7F, 67},
+    {0x1D80, 0x1DBF, 68},
+    {0x1DC0, 0x1DFF, 69},
+    {0x1E00, 0x1EFF, 70},
+    {0x1F00, 0x1FFF, 71},
+    {0x2000, 0x206F, 72},
+    {0x2070, 0x209F, 73},
+    {0x20A0, 0x20CF, 74},
+    {0x20D0, 0x20FF, 75},
+    {0x2100, 0x214F, 76},
+    {0x2150, 0x218F, 77},
+    {0x2190, 0x21FF, 78},
+    {0x2200, 0x22FF, 79},
+    {0x2300, 0x23FF, 80},
+    {0x2400, 0x243F, 81},
+    {0x2440, 0x245F, 82},
+    {0x2460, 0x24FF, 83},
+    {0x2500, 0x257F, 84},
+    {0x2580, 0x259F, 85},
+    {0x25A0, 0x25FF, 86},
+    {0x2600, 0x26FF, 87},
+    {0x2700, 0x27BF, 88},
+    {0x27C0, 0x27EF, 89},
+    {0x27F0, 0x27FF, 90},
+    {0x2800, 0x28FF, 91},
+    {0x2900, 0x297F, 92},
+    {0x2980, 0x29FF, 93},
+    {0x2A00, 0x2AFF, 94},
+    {0x2B00, 0x2BFF, 95},
+    {0x2C00, 0x2C5F, 96},
+    {0x2C60, 0x2C7F, 97},
+    {0x2C80, 0x2CFF, 98},
+    {0x2D00, 0x2D2F, 99},
+    {0x2D30, 0x2D7F, 100},
+    {0x2D80, 0x2DDF, 101},
+    {0x2DE0, 0x2DFF, 102},
+    {0x2E00, 0x2E7F, 103},
+    {0x2E80, 0x2EFF, 104},
+    {0x2F00, 0x2FDF, 105},
+    {0x2FF0, 0x2FFF, 106},
+    {0x3000, 0x303F, 107},
+    {0x3040, 0x309F, 108},
+    {0x30A0, 0x30FF, 109},
+    {0x3100, 0x312F, 110},
+    {0x3130, 0x318F, 111},
+    {0x3190, 0x319F, 112},
+    {0x31A0, 0x31BF, 113},
+    {0x31C0, 0x31EF, 114},
+    {0x31F0, 0x31FF, 115},
+    {0x3200, 0x32FF, 116},
+    {0x3300, 0x33FF, 117},
+    {0x3400, 0x4DBF, 118},
+    {0x4DC0, 0x4DFF, 119},
+    {0x4E00, 0x9FFF, 120},
+    {0xA000, 0xA48F, 121},
+    {0xA490, 0xA4CF, 122},
+    {0xA4D0, 0xA4FF, 123},
+    {0xA500, 0xA63F, 124},
+    {0xA640, 0xA69F, 125},
+    {0xA6A0, 0xA6FF, 126},
+    {0xA700, 0xA71F, 127},
+    {0xA720, 0xA7FF, 128},
+    {0xA800, 0xA82F, 129},
+    {0xA830, 0xA83F, 130},
+    {0xA840, 0xA87F, 131},
+    {0xA880, 0xA8DF, 132},
+    {0xA8E0, 0xA8FF, 133},
+    {0xA900, 0xA92F, 134},
+    {0xA930, 0xA95F, 135},
+    {0xA960, 0xA97F, 136},
+    {0xA980, 0xA9DF, 137},
+    {0xA9E0, 0xA9FF, 138},
+    {0xAA00, 0xAA5F, 139},
+    {0xAA60, 0xAA7F, 140},
+    {0xAA80, 0xAADF, 141},
+    {0xAAE0, 0xAAFF, 142},
+    {0xAB00, 0xAB2F, 143},
+    {0xAB30, 0xAB6F, 144},
+    {0xAB70, 0xABBF, 145},
+    {0xABC0, 0xABFF, 146},
+    {0xAC00, 0xD7AF, 147},
+    {0xD7B0, 0xD7FF, 148},
+    {0xD800, 0xDB7F, 149},
+    {0xDB80, 0xDBFF, 150},
+    {0xDC00, 0xDFFF, 151},
+    {0xE000, 0xF8FF, 152},
+    {0xF900, 0xFAFF, 153},
+    {0xFB00, 0xFB4F, 154},
+    {0xFB50, 0xFDFF, 155},
+    {0xFE00, 0xFE0F, 156},
+    {0xFE10, 0xFE1F, 157},
+    {0xFE20, 0xFE2F, 158},
+    {0xFE30, 0xFE4F, 159},
+    {0xFE50, 0xFE6F, 160},
+    {0xFE70, 0xFEFF, 161},
+    {0xFF00, 0xFFEF, 162},
+    {0xFFF0, 0xFFFF, 163},
+    {0x10000, 0x1007F, 164},
+    {0x10080, 0x100FF, 165},
+    {0x10100, 0x1013F, 166},
+    {0x10140, 0x1018F, 167},
+    {0x10190, 0x101CF, 168},
+    {0x101D0, 0x101FF, 169},
+    {0x10280, 0x1029F, 170},
+    {0x102A0, 0x102DF, 171},
+    {0x102E0, 0x102FF, 172},
+    {0x10300, 0x1032F, 173},
+    {0x10330, 0x1034F, 174},
+    {0x10350, 0x1037F, 175},
+    {0x10380, 0x1039F, 176},
+    {0x103A0, 0x103DF, 177},
+    {0x10400, 0x1044F, 178},
+    {0x10450, 0x1047F, 179},
+    {0x10480, 0x104AF, 180},
+    {0x104B0, 0x104FF, 181},
+    {0x10500, 0x1052F, 182},
+    {0x10530, 0x1056F, 183},
+    {0x10570, 0x105BF, 184},
+    {0x105C0, 0x105FF, 185},
+    {0x10600, 0x1077F, 186},
+    {0x10780, 0x107BF, 187},
+    {0x10800, 0x1083F, 188},
+    {0x10840, 0x1085F, 189},
+    {0x10860, 0x1087F, 190},
+    {0x10880, 0x108AF, 191},
+    {0x108E0, 0x108FF, 192},
+    {0x10900, 0x1091F, 193},
+    {0x10920, 0x1093F, 194},
+    {0x10940, 0x1095F, 195},
+    {0x10980, 0x1099F, 196},
+    {0x109A0, 0x109FF, 197},
+    {0x10A00, 0x10A5F, 198},
+    {0x10A60, 0x10A7F, 199},
+    {0x10A80, 0x10A9F, 200},
+    {0x10AC0, 0x10AFF, 201},
+    {0x10B00, 0x10B3F, 202},
+    {0x10B40, 0x10B5F, 203},
+    {0x10B60, 0x10B7F, 204},
+    {0x10B80, 0x10BAF, 205},
+    {0x10C00, 0x10C4F, 206},
+    {0x10C80, 0x10CFF, 207},
+    {0x10D00, 0x10D3F, 208},
+    {0x10D40, 0x10D8F, 209},
+    {0x10E60, 0x10E7F, 210},
+    {0x10E80, 0x10EBF, 211},
+    {0x10EC0, 0x10EFF, 212},
+    {0x10F00, 0x10F2F, 213},
+    {0x10F30, 0x10F6F, 214},
+    {0x10F70, 0x10FAF, 215},
+    {0x10FB0, 0x10FDF, 216},
+    {0x10FE0, 0x10FFF, 217},
+    {0x11000, 0x1107F, 218},
+    {0x11080, 0x110CF, 219},
+    {0x110D0, 0x110FF, 220},
+    {0x11100, 0x1114F, 221},
+    {0x11150, 0x1117F, 222},
+    {0x11180, 0x111DF, 223},
+    {0x111E0, 0x111FF, 224},
+    {0x11200, 0x1124F, 225},
+    {0x11280, 0x112AF, 226},
+    {0x112B0, 0x112FF, 227},
+    {0x11300, 0x1137F, 228},
+    {0x11380, 0x113FF, 229},
+    {0x11400, 0x1147F, 230},
+    {0x11480, 0x114DF, 231},
+    {0x11580, 0x115FF, 232},
+    {0x11600, 0x1165F, 233},
+    {0x11660, 0x1167F, 234},
+    {0x11680, 0x116CF, 235},
+    {0x116D0, 0x116FF, 236},
+    {0x11700, 0x1174F, 237},
+    {0x11800, 0x1184F, 238},
+    {0x118A0, 0x118FF, 239},
+    {0x11900, 0x1195F, 240},
+    {0x119A0, 0x119FF, 241},
+    {0x11A00, 0x11A4F, 242},
+    {0x11A50, 0x11AAF, 243},
+    {0x11AB0, 0x11ABF, 244},
+    {0x11AC0, 0x11AFF, 245},
+    {0x11B00, 0x11B5F, 246},
+    {0x11B60, 0x11B7F, 247},
+    {0x11BC0, 0x11BFF, 248},
+    {0x11C00, 0x11C6F, 249},
+    {0x11C70, 0x11CBF, 250},
+    {0x11D00, 0x11D5F, 251},
+    {0x11D60, 0x11DAF, 252},
+    {0x11DB0, 0x11DEF, 253},
+    {0x11EE0, 0x11EFF, 254},
+    {0x11F00, 0x11F5F, 255},
+    {0x11FB0, 0x11FBF, 256},
+    {0x11FC0, 0x11FFF, 257},
+    {0x12000, 0x123FF, 258},
+    {0x12400, 0x1247F, 259},
+    {0x12480, 0x1254F, 260},
+    {0x12F90, 0x12FFF, 261},
+    {0x13000, 0x1342F, 262},
+    {0x13430, 0x1345F, 263},
+    {0x13460, 0x143FF, 264},
+    {0x14400, 0x1467F, 265},
+    {0x16100, 0x1613F, 266},
+    {0x16800, 0x16A3F, 267},
+    {0x16A40, 0x16A6F, 268},
+    {0x16A70, 0x16ACF, 269},
+    {0x16AD0, 0x16AFF, 270},
+    {0x16B00, 0x16B8F, 271},
+    {0x16D40, 0x16D7F, 272},
+    {0x16E40, 0x16E9F, 273},
+    {0x16EA0, 0x16EDF, 274},
+    {0x16F00, 0x16F9F, 275},
+    {0x16FE0, 0x16FFF, 276},
+    {0x17000, 0x187FF, 277},
+    {0x18800, 0x18AFF, 278},
+    {0x18B00, 0x18CFF, 279},
+    {0x18D00, 0x18D7F, 280},
+    {0x18D80, 0x18DFF, 281},
+    {0x1AFF0, 0x1AFFF, 282},
+    {0x1B000, 0x1B0FF, 283},
+    {0x1B100, 0x1B12F, 284},
+    {0x1B130, 0x1B16F, 285},
+    {0x1B170, 0x1B2FF, 286},
+    {0x1BC00, 0x1BC9F, 287},
+    {0x1BCA0, 0x1BCAF, 288},
+    {0x1CC00, 0x1CEBF, 289},
+    {0x1CEC0, 0x1CEFF, 290},
+    {0x1CF00, 0x1CFCF, 291},
+    {0x1D000, 0x1D0FF, 292},
+    {0x1D100, 0x1D1FF, 293},
+    {0x1D200, 0x1D24F, 294},
+    {0x1D2C0, 0x1D2DF, 295},
+    {0x1D2E0, 0x1D2FF, 296},
+    {0x1D300, 0x1D35F, 297},
+    {0x1D360, 0x1D37F, 298},
+    {0x1D400, 0x1D7FF, 299},
+    {0x1D800, 0x1DAAF, 300},
+    {0x1DF00, 0x1DFFF, 301},
+    {0x1E000, 0x1E02F, 302},
+    {0x1E030, 0x1E08F, 303},
+    {0x1E100, 0x1E14F, 304},
+    {0x1E290, 0x1E2BF, 305},
+    {0x1E2C0, 0x1E2FF, 306},
+    {0x1E4D0, 0x1E4FF, 307},
+    {0x1E5D0, 0x1E5FF, 308},
+    {0x1E6C0, 0x1E6FF, 309},
+    {0x1E7E0, 0x1E7FF, 310},
+    {0x1E800, 0x1E8DF, 311},
+    {0x1E900, 0x1E95F, 312},
+    {0x1EC70, 0x1ECBF, 313},
+    {0x1ED00, 0x1ED4F, 314},
+    {0x1EE00, 0x1EEFF, 315},
+    {0x1F000, 0x1F02F, 316},
+    {0x1F030, 0x1F09F, 317},
+    {0x1F0A0, 0x1F0FF, 318},
+    {0x1F100, 0x1F1FF, 319},
+    {0x1F200, 0x1F2FF, 320},
+    {0x1F300, 0x1F5FF, 321},
+    {0x1F600, 0x1F64F, 322},
+    {0x1F650, 0x1F67F, 323},
+    {0x1F680, 0x1F6FF, 324},
+    {0x1F700, 0x1F77F, 325},
+    {0x1F780, 0x1F7FF, 326},
+    {0x1F800, 0x1F8FF, 327},
+    {0x1F900, 0x1F9FF, 328},
+    {0x1FA00, 0x1FA6F, 329},
+    {0x1FA70, 0x1FAFF, 330},
+    {0x1FB00, 0x1FBFF, 331},
+    {0x20000, 0x2A6DF, 332},
+    {0x2A700, 0x2B73F, 333},
+    {0x2B740, 0x2B81F, 334},
+    {0x2B820, 0x2CEAF, 335},
+    {0x2CEB0, 0x2EBEF, 336},
+    {0x2EBF0, 0x2EE5F, 337},
+    {0x2F800, 0x2FA1F, 338},
+    {0x30000, 0x3134F, 339},
+    {0x31350, 0x323AF, 340},
+    {0x323B0, 0x3347F, 341},
+    {0xE0000, 0xE007F, 342},
+    {0xE0100, 0xE01EF, 343},
+    {0xF0000, 0xFFFFF, 344},
+    {0x100000, 0x10FFFF, 345},
+};
+#define BLOCK_COUNT 346
+
 static const char *decomp_prefix[] = {
     "",
     "<noBreak>",
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
index 11f626ca0aba7a..5db850ca2d1f0c 100644
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -60,6 +60,7 @@
 CASE_FOLDING = "CaseFolding%s.txt"
 GRAPHEME_CLUSTER_BREAK = "auxiliary/GraphemeBreakProperty%s.txt"
 EMOJI_DATA = "emoji/emoji-data%s.txt"
+BLOCKS = "Blocks%s.txt"
 
 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@@ -392,6 +393,34 @@ def makeunicodedata(unicode, trace):
         fprint("    NULL")
         fprint("};")
 
+        # Generate block tables
+        names = []
+        name_to_index = {}
+        blocks = []
+        for start, end, name in unicode.blocks:
+            if name not in name_to_index:
+                name_to_index[name] = len(names)
+                names.append(name)
+            blocks.append((start, end, name_to_index[name]))
+
+        fprint("static const char * const _PyUnicode_BlockNames[] = {")
+        for name in names:
+            fprint('    "%s",' % name)
+        fprint("};")
+
+        fprint("typedef struct {")
+        fprint("    Py_UCS4 start;")
+        fprint("    Py_UCS4 end;")
+        fprint("    unsigned short name;")
+        fprint("} _PyUnicode_Block;")
+
+        fprint("static const _PyUnicode_Block _PyUnicode_Blocks[] = {")
+        for start, end, name in blocks:
+            fprint("    {0x%04X, 0x%04X, %d}," % (start, end, name))
+        fprint("};")
+        fprint(f"#define BLOCK_COUNT {len(blocks)}")
+        fprint()
+
         fprint("static const char *decomp_prefix[] = {")
         for name in decomp_prefix:
             fprint("    \"%s\"," % name)
@@ -1205,6 +1234,13 @@ def __init__(self, version, ideograph_check=True):
                     ext_picts[char] = True
             self.ext_picts = ext_picts
 
+            # See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G64189
+            self.blocks = []
+            for record in UcdFile(BLOCKS, version).records():
+                start_end, name = record
+                start, end = [int(c, 16) for c in start_end.split('..')]
+                self.blocks.append((start, end, name))
+            self.blocks.sort()
 
     def uselatin1(self):
         # restrict character range to ISO Latin 1

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]

Reply via email to