https://github.com/python/cpython/commit/92c0ec2b007757287a5c4791437a8d5a6173ce58
commit: 92c0ec2b007757287a5c4791437a8d5a6173ce58
branch: main
author: Serhiy Storchaka <[email protected]>
committer: serhiy-storchaka <[email protected]>
date: 2026-01-29T17:33:10+02:00
summary:
gh-144264: Speed up Base64 decoding of data containing ignored characters
(GH-144265)
Try the fast path again after decoding a quad in the slow path.
Use a bitmap cache for the ignorechars argument.
files:
A Misc/NEWS.d/next/Library/2026-01-27-10-02-04.gh-issue-144264.Wmzbol.rst
M Lib/test/test_binascii.py
M Modules/binascii.c
diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py
index 4cfc332e89bea8..49accb08b62e40 100644
--- a/Lib/test/test_binascii.py
+++ b/Lib/test/test_binascii.py
@@ -202,6 +202,17 @@ def assertNonBase64Data(data, expected, ignorechars):
assertNonBase64Data(b'a\nb==', b'i', ignorechars=bytearray(b'\n'))
assertNonBase64Data(b'a\nb==', b'i', ignorechars=memoryview(b'\n'))
+ # Same cell in the cache: '\r' >> 3 == '\n' >> 3.
+ data = self.type2test(b'\r\n')
+ with self.assertRaises(binascii.Error):
+ binascii.a2b_base64(data, ignorechars=b'\r')
+ self.assertEqual(binascii.a2b_base64(data, ignorechars=b'\r\n'), b'')
+ # Same bit mask in the cache: '*' & 7 == '\n' & 7.
+ data = self.type2test(b'*\n')
+ with self.assertRaises(binascii.Error):
+ binascii.a2b_base64(data, ignorechars=b'*')
+ self.assertEqual(binascii.a2b_base64(data, ignorechars=b'*\n'), b'')
+
data = self.type2test(b'a\nb==')
with self.assertRaises(TypeError):
binascii.a2b_base64(data, ignorechars='')
diff --git
a/Misc/NEWS.d/next/Library/2026-01-27-10-02-04.gh-issue-144264.Wmzbol.rst
b/Misc/NEWS.d/next/Library/2026-01-27-10-02-04.gh-issue-144264.Wmzbol.rst
new file mode 100644
index 00000000000000..11e3fdeb4355cf
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-01-27-10-02-04.gh-issue-144264.Wmzbol.rst
@@ -0,0 +1,3 @@
+Speed up Base64 decoding of data containing ignored characters (both in
+non-strict mode and with an explicit *ignorechars* argument).
+It is now up to 2 times faster for multiline Base64 data.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 593b27ac5ede65..201e7798bb7a8c 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -469,12 +469,23 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data,
int backtick)
return PyBytesWriter_FinishWithPointer(writer, ascii_data);
}
+typedef unsigned char ignorecache_t[32];
static int
-ignorechar(unsigned char c, Py_buffer *ignorechars)
+ignorechar(unsigned char c, const Py_buffer *ignorechars,
+ ignorecache_t ignorecache)
{
- return (ignorechars->buf != NULL &&
- memchr(ignorechars->buf, c, ignorechars->len));
+ if (ignorechars == NULL) {
+ return 0;
+ }
+ if (ignorecache[c >> 3] & (1 << (c & 7))) {
+ return 1;
+ }
+ if (memchr(ignorechars->buf, c, ignorechars->len)) {
+ ignorecache[c >> 3] |= 1 << (c & 7);
+ return 1;
+ }
+ return 0;
}
/*[clinic input]
@@ -508,6 +519,13 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer
*data, int strict_mode,
if (strict_mode == -1) {
strict_mode = (ignorechars->buf != NULL);
}
+ if (!strict_mode || ignorechars->buf == NULL || ignorechars->len == 0) {
+ ignorechars = NULL;
+ }
+ ignorecache_t ignorecache;
+ if (ignorechars != NULL) {
+ memset(ignorecache, 0, sizeof(ignorecache));
+ }
/* Allocate the buffer */
Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later
*/
@@ -517,8 +535,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
}
unsigned char *bin_data = PyBytesWriter_GetData(writer);
- size_t i = 0; /* Current position in input */
-
+fastpath:
/* Fast path: use optimized decoder for complete quads.
* This works for both strict and non-strict mode for valid input.
* The fast path stops at padding, invalid chars, or incomplete groups.
@@ -527,7 +544,8 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
Py_ssize_t fast_chars = base64_decode_fast(ascii_data,
(Py_ssize_t)ascii_len,
bin_data, table_a2b_base64);
if (fast_chars > 0) {
- i = (size_t)fast_chars;
+ ascii_data += fast_chars;
+ ascii_len -= fast_chars;
bin_data += (fast_chars / 4) * 3;
}
}
@@ -536,8 +554,8 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
- for (; i < ascii_len; i++) {
- unsigned char this_ch = ascii_data[i];
+ for (; ascii_len; ascii_data++, ascii_len--) {
+ unsigned char this_ch = *ascii_data;
/* Check for pad sequences and ignore
** the invalid ones.
@@ -549,7 +567,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
if (quad_pos == 0) {
state = get_binascii_state(module);
if (state) {
- PyErr_SetString(state->Error, (i == 0)
+ PyErr_SetString(state->Error, (ascii_data == data->buf)
? "Leading padding not allowed"
: "Excess padding not allowed");
}
@@ -580,7 +598,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
unsigned char v = table_a2b_base64[this_ch];
if (v >= 64) {
- if (strict_mode && !ignorechar(this_ch, ignorechars)) {
+ if (strict_mode && !ignorechar(this_ch, ignorechars, ignorecache))
{
state = get_binascii_state(module);
if (state) {
PyErr_SetString(state->Error, "Only base64 data is
allowed");
@@ -621,7 +639,9 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data,
int strict_mode,
quad_pos = 0;
*bin_data++ = (leftchar << 6) | (v);
leftchar = 0;
- break;
+ ascii_data++;
+ ascii_len--;
+ goto fastpath;
}
}
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3//lists/python-checkins.python.org
Member address: [email protected]