std::regex builds a cache of equivalence classes by calling
std::regex_traits<char>::transform_primary(c) for every char, which then
calls std::collate<char>::transform which calls strxfrm. On several
targets strxfrm fails for non-ASCII characters. Because strxfrm has no
return value reserved to indicate an error, some implementations return
INT_MAX or SIZE_MAX. This causes std::collate::transform to try to
allocate a huge buffer, which is either very slow or throws
std::bad_alloc. We should check errno after calling strxfrm to detect
errors and then throw a more appropriate exception instead of trying to
allocate a huge buffer.
Unfortunately the std::collate<C>::_M_transform function has a
non-throwing exception specifier, so we can't do the error handling
there.
As well as checking errno, this patch changes std::collate::do_transform
to use __builtin_alloca for small inputs, and to use RAII to deallocate
the buffers used for large inputs.
This change isn't sufficient to fix the three std::regex bugs caused by
the lack of error handling in std::collate::do_transform; we also need
to make std::regex_traits::transform_primary handle exceptions. This
change also attempts to make transform_primary closer to the effects
described in the standard, by not even attempting to use std::collate
if the locale's std::collate facet has been replaced (see PR 118105).
Arguably, we should not even try to call transform_primary for any char
values over 127, since they're never valid in locales that use UTF-8 or
7-bit ASCII, and probably for other charsets too. Handling 128
exceptions for every std::regex compilation is very inefficient, but at
least it now works instead of failing with std::bad_alloc, and no longer
allocates 128 x 2GB. Maybe for C++26 we could check the locale's
std::text_encoding and use that to decide whether to cache equivalence
classes for char values over 127.
I'm unsure if std::regex_traits<C>::transform_primary is supposed to
convert the string to lower case or not. The general regex traits
requirements ([re.req] p20) do say "when character case is not
considered" but the specification for the std::regex_traits<char> and
std::regex_traits<wchar_t> specializations ([re.traits] p7) doesn't say
anything about that.
libstdc++-v3/ChangeLog:
PR libstdc++/85824
PR libstdc++/94409
PR libstdc++/98723
PR libstdc++/118105
* include/bits/locale_classes.tcc (collate::do_transform): Check
errno after calling _M_transform. Use __builtin_alloca for small
inputs. Use RAII type to manage the buffer and to restore errno.
* include/bits/regex.h (regex_traits::transform_primary): Handle
exceptions from std::collate::transform and do not try to use
std::collate for user-defined facets.
---
Tested x86_64-linux.
libstdc++-v3/include/bits/locale_classes.tcc | 94 ++++++++++++++------
libstdc++-v3/include/bits/regex.h | 43 ++++++---
2 files changed, 96 insertions(+), 41 deletions(-)
diff --git a/libstdc++-v3/include/bits/locale_classes.tcc
b/libstdc++-v3/include/bits/locale_classes.tcc
index 2b78008e9ae..6e8f27bf0d9 100644
--- a/libstdc++-v3/include/bits/locale_classes.tcc
+++ b/libstdc++-v3/include/bits/locale_classes.tcc
@@ -37,6 +37,9 @@
#ifdef _GLIBCXX_SYSHDR
#pragma GCC system_header
#endif
+
+#include <cerrno>
+
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wc++11-extensions" // extern template
#pragma GCC diagnostic ignored "-Wvariadic-macros"
@@ -295,43 +298,76 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
size_t __len = (__hi - __lo) * 2;
- _CharT* __c = new _CharT[__len];
+ struct _Buf
+ {
+ _Buf(size_t __n, void* __buf, int __e)
+ : _M_c(__buf ? (_CharT*)__buf : new _CharT[__n]),
+ _M_stackbuf(__buf),
+ _M_errno(__e)
+ { }
- __try
+ ~_Buf()
{
- // strxfrm stops when it sees a nul character so we break
- // the string into zero-terminated substrings and pass those
- // to strxfrm.
- for (;;)
+ if (_M_c != _M_stackbuf)
+ delete[] _M_c;
+ if (errno == 0)
+ errno = _M_errno;
+ }
+
+ void _M_realloc(size_t __len)
+ {
+ _CharT* __p = new _CharT[__len];
+ if (_M_c != _M_stackbuf)
+ delete[] _M_c;
+ _M_c = __p;
+ }
+
+ _CharT* _M_c;
+ void* const _M_stackbuf;
+ int _M_errno;
+ };
+
+ const size_t __bytes = __len * sizeof(_CharT);
+ _Buf __buf(__len, __bytes <= 256 ? __builtin_alloca(__bytes) : 0, errno);
+ errno = 0;
+
+ // strxfrm stops when it sees a nul character so we break
+ // the string into zero-terminated substrings and pass those
+ // to strxfrm.
+ for (;;)
+ {
+ // First try a buffer perhaps big enough.
+ size_t __res = _M_transform(__buf._M_c, __p, __len);
+ // If the buffer was not large enough, try again with the
+ // correct size.
+ if (__res >= __len)
{
- // First try a buffer perhaps big enough.
- size_t __res = _M_transform(__c, __p, __len);
- // If the buffer was not large enough, try again with the
- // correct size.
- if (__res >= __len)
+ if (__builtin_expect(errno, 0))
{
- __len = __res + 1;
- delete [] __c, __c = 0;
- __c = new _CharT[__len];
- __res = _M_transform(__c, __p, __len);
+#if __cpp_exceptions
+ __throw_system_error(errno);
+#else
+ // std::regex can call this function internally with
+ // char values that always fail, so we don't want to
+ // use _GLIBCXX_THROW_OR_ABORT here.
+ __ret.clear();
+ break;
+#endif
}
- __ret.append(__c, __res);
- __p += char_traits<_CharT>::length(__p);
- if (__p == __pend)
- break;
-
- __p++;
- __ret.push_back(_CharT());
+ __len = __res + 1;
+ __buf._M_realloc(__len);
+ __res = _M_transform(__buf._M_c, __p, __len);
}
- }
- __catch(...)
- {
- delete [] __c;
- __throw_exception_again;
- }
- delete [] __c;
+ __ret.append(__buf._M_c, __res);
+ __p += char_traits<_CharT>::length(__p);
+ if (__p == __pend)
+ break;
+
+ __p++;
+ __ret.push_back(_CharT());
+ }
return __ret;
}
diff --git a/libstdc++-v3/include/bits/regex.h
b/libstdc++-v3/include/bits/regex.h
index 68ff479c905..57ea68e7ee9 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -253,9 +253,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
* @param __first beginning of the character sequence.
* @param __last one-past-the-end of the character sequence.
*
- * Effects: if typeid(use_facet<collate<_Ch_type> >) ==
- * typeid(collate_byname<_Ch_type>) and the form of the sort key
- * returned by collate_byname<_Ch_type>::transform(__first, __last)
+ * Effects: if `typeid(use_facet<collate<_Ch_type>>(getloc())) ==
+ * typeid(collate_byname<_Ch_type>)` and the form of the sort key
+ * returned by `collate_byname<_Ch_type>::transform(__first, __last)`
* is known and can be converted into a primary sort key
* then returns that key, otherwise returns an empty string.
*
@@ -265,17 +265,36 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
string_type
transform_primary(_Fwd_iter __first, _Fwd_iter __last) const
{
+ string_type __ret;
+#if __cpp_rtti
+ const auto& __fclt = use_facet<collate<char_type>>(_M_locale);
+ if (typeid(__fclt) != typeid(collate<char_type>)) // FIXME: PR 118110
+ return __ret;
+
// TODO : this is not entirely correct.
// This function requires extra support from the platform.
- //
- // Read http://gcc.gnu.org/ml/libstdc++/2013-09/msg00117.html and
- // http://www.open-std.org/Jtc1/sc22/wg21/docs/papers/2003/n1429.htm
- // for details.
- typedef std::ctype<char_type> __ctype_type;
- const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
- _GLIBCXX_STD_C::vector<char_type> __s(__first, __last);
- __fctyp.tolower(__s.data(), __s.data() + __s.size());
- return this->transform(__s.data(), __s.data() + __s.size());
+ // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118105
+
+ const auto& __fctyp(use_facet<ctype<char_type>>(_M_locale));
+ basic_string<char_type> __s(__first, __last);
+ const auto __p = const_cast<char_type*>(__s.c_str());
+ const auto __pend = __p + __s.size();
+ // XXX: should we use tolower here? The regex traits requirements
+ // say that transform_primary ignores case, but the specification
+ // for the std::regex_traits<char> and std::regex_traits<wchar_t>
+ // specializations don't, they seem to suggest just using the
+ // collate::transform function to get a primary sort key.
+ __fctyp.tolower(__p, __pend);
+
+ __try
+ {
+ __ret = __fclt.transform(__p, __pend);
+ }
+ __catch (const exception&)
+ {
+ }
+#endif
+ return __ret;
}
/**
--
2.47.1