On Tue, 14 Oct 2025 at 14:11, Tomasz Kaminski <[email protected]> wrote:
>
>
>
> On Tue, Oct 14, 2025 at 2:57 PM Tomasz Kaminski <[email protected]> wrote:
>>
>>
>>
>>
>> On Tue, Oct 14, 2025 at 2:53 PM Jonathan Wakely <[email protected]> wrote:
>>>
>>> On Wed, 10 Sep 2025 at 15:53 +0200, Tomasz Kamiński wrote:
>>> >This patch implements _Escaping_sink that stores characters in a local
>>> >(stack)
>>> >buffer. When the buffer is full, the range of characters is escaped and
>>> >written
>>> >to the underlying sink.
>>> >
>>> >To support the above, the __write_escaped_unicode_part function is defined.
>>> >It takes __str and __prev_esc by reference. The __prev_esc value is
>>> >updated based
>>> >on the last character written. If the buffer ends with an incomplete code
>>> >point
>>> >sequence, __str is left non-empty and the characters are not written.
>>> >_Escaping_sink then copies these characters to the front of the buffer to
>>> >reconstruct the full code point.
>>> >
>>> >__formatter_str::_M_format_range now uses _Escaping_sink to escape any
>>> >non-contiguous character sequences.
>>> >
>>> >This addresses PR119820 by removing the code constructing string
>>> >completely.
>>> >
>>> > PR libstdc++/119820
>>> >
>>> >libstdc++-v3/ChangeLog:
>>> >
>>> > * include/std/format (__format::__write_escape_seqs)
>>> > (__format::_Escaping_sink): Define.
>>> > (__format::__write_escaped_unicode_part): Extract from
>>> > __format::__write_escaped_unicode.
>>> > (__format::__write_escaped_unicode): Forward to
>>> > __write_escaped_unicode_part.
>>> > (__formatter_str::_M_format_range): Use _Escaping_sink.
>>> > * testsuite/std/format/ranges/string.cc: New tests for
>>> > characters whose code points will be split in the buffer, and for
>>> > escaping. Invoke test_padding.
>>> >---
>>> >v2 just updates the patch description.
>>> >
>>> > libstdc++-v3/include/std/format | 197 +++++++++++++-----
>>> > .../testsuite/std/format/ranges/string.cc | 89 ++++++++
>>> > 2 files changed, 231 insertions(+), 55 deletions(-)
>>> >
>>> >diff --git a/libstdc++-v3/include/std/format
>>> >b/libstdc++-v3/include/std/format
>>> >index d6a2170e45d..347f9f0a479 100644
>>> >--- a/libstdc++-v3/include/std/format
>>> >+++ b/libstdc++-v3/include/std/format
>>> >@@ -105,6 +105,7 @@ namespace __format
>>> > template<typename _CharT> class _Sink;
>>> > template<typename _CharT> class _Fixedbuf_sink;
>>> > template<typename _Out, typename _CharT> class _Padding_sink;
>>> >+ template<typename _Out, typename _CharT> class _Escaping_sink;
>>> >
>>> > // Output iterator that writes to a type-erase character sink.
>>> > template<typename _CharT>
>>> >@@ -1066,6 +1067,17 @@ namespace __format
>>> > return ++__out;
>>> > }
>>> >
>>> >+ template<typename _Out, typename _CharT>
>>> >+ _Out
>>> >+ __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units)
>>> >+ {
>>> >+ using _UChar = make_unsigned_t<_CharT>;
>>> >+ for (_CharT __c : __units)
>>> >+ __out = __format::__write_escape_seq(
>>> >+ __out, static_cast<_UChar>(__c),
>>> >_Escapes<_CharT>::_S_x());
>>>
>>> This is always a _Sink_iter, so we don't need to pass
>>> std::move(__out), right?
>>
>> Yes, but that's the case everywhere, so I will just add std::move here for
>> consistency.
>
> Other functions for writing escaped sequences also do not do moves, so I will
> leave it
> as is. Both _Sink_iter and _Drop_iter do not need move.
Yeah, we could remove lots of std::move calls elsewhere, because
they're redundant. But with -O1 they'll be completely removed by
-ffold-simple-inlines so I don't think it's a priority to do anything.
I agree with not bothering to add the std::move to the new function in
this patch.
>>>
>>>
>>> OK for trunk.
>>>
>>>
>>> >+ return __out;
>>> >+ }
>>> >+
>>> > template<typename _Out, typename _CharT>
>>> > _Out
>>> > __write_escaped_char(_Out __out, _CharT __c)
>>> >@@ -1124,12 +1136,10 @@ namespace __format
>>> >
>>> > template<typename _CharT, typename _Out>
>>> > _Out
>>> >- __write_escaped_unicode(_Out __out,
>>> >- basic_string_view<_CharT> __str,
>>> >- _Term_char __term)
>>> >+ __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>&
>>> >__str,
>>> >+ bool& __prev_esc, _Term_char __term)
>>> > {
>>> > using _Str_view = basic_string_view<_CharT>;
>>> >- using _UChar = make_unsigned_t<_CharT>;
>>> > using _Esc = _Escapes<_CharT>;
>>> >
>>> > static constexpr char32_t __replace = U'\uFFFD';
>>> >@@ -1143,10 +1153,10 @@ namespace __format
>>> > }();
>>> >
>>> > __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str));
>>> >+ __str = {};
>>> >+
>>> > auto __first = __v.begin();
>>> > auto const __last = __v.end();
>>> >-
>>> >- bool __prev_esc = true;
>>> > while (__first != __last)
>>> > {
>>> > bool __esc_ascii = false;
>>> >@@ -1185,15 +1195,32 @@ namespace __format
>>> > __out = __format::__write_escaped_char(__out, *__first.base());
>>> > else if (__esc_unicode)
>>> > __out = __format::__write_escape_seq(__out, *__first,
>>> > _Esc::_S_u());
>>> >- else // __esc_replace
>>> >- for (_CharT __c : _Str_view(__first.base(), __first._M_units()))
>>> >- __out = __format::__write_escape_seq(__out,
>>> >- static_cast<_UChar>(__c),
>>> >- _Esc::_S_x());
>>> >+ // __esc_replace
>>> >+ else if (_Str_view __units(__first.base(), __first._M_units());
>>> >+ __units.end() != __last.base())
>>> >+ __out = __format::__write_escape_seqs(__out, __units);
>>> >+ else
>>> >+ {
>>> >+ __str = __units;
>>> >+ return __out;
>>> >+ }
>>> >+
>>> > __prev_esc = true;
>>> > ++__first;
>>> >-
>>> > }
>>> >+
>>> >+ return __out;
>>> >+ }
>>> >+
>>> >+ template<typename _CharT, typename _Out>
>>> >+ _Out
>>> >+ __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str,
>>> >+ _Term_char __term)
>>> >+ {
>>> >+ bool __prev_escape = true;
>>> >+ __out = __format::__write_escaped_unicode_part(__out, __str,
>>> >+ __prev_escape, __term);
>>> >+ __out = __format::__write_escape_seqs(__out, __str);
>>> > return __out;
>>> > }
>>> >
>>> >@@ -1412,55 +1439,28 @@ namespace __format
>>> > size_t(ranges::distance(__rg)));
>>> > return format(__str, __fc);
>>> > }
>>> >- else if (!_M_spec._M_debug)
>>> >+ else
>>> > {
>>> >+ auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut
>>> >__nout)
>>> >+ {
>>> >+ if (!_M_spec._M_debug)
>>> >+ return ranges::copy(__rg, std::move(__nout)).out;
>>> >+
>>> >+ _Escaping_sink<_NOut, _CharT>
>>> >+ __sink(std::move(__nout), _Term_quote);
>>> >+ ranges::copy(__rg, __sink.out());
>>> >+ return __sink._M_finish();
>>> >+ };
>>> >+
>>> > const size_t __padwidth = _M_spec._M_get_width(__fc);
>>> > if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none)
>>> >- return ranges::copy(__rg, __fc.out()).out;
>>> >+ return __handle_debug(__fc.out());
>>> >
>>> >- _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth,
>>> >-
>>> >_M_spec._M_get_precision(__fc));
>>> >- ranges::copy(__rg, __sink.out());
>>> >+ _Padding_sink<_Out, _CharT>
>>> >+ __sink(__fc.out(), __padwidth,
>>> >_M_spec._M_get_precision(__fc));
>>> >+ __handle_debug(__sink.out());
>>> > return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
>>> > }
>>> >- else if constexpr (ranges::forward_range<_Rg> ||
>>> >ranges::sized_range<_Rg>)
>>> >- {
>>> >- const size_t __n(ranges::distance(__rg));
>>> >- size_t __w = __n;
>>> >- if constexpr
>>> >(!__unicode::__literal_encoding_is_unicode<_CharT>())
>>> >- if (size_t __max = _M_spec._M_get_precision(__fc); __n >
>>> >__max)
>>> >- __w == __max;
>>> >-
>>> >- if (__w <= __format::__stackbuf_size<_CharT>)
>>> >- {
>>> >- _CharT __buf[__format::__stackbuf_size<_CharT>];
>>> >- ranges::copy_n(ranges::begin(__rg), __w, __buf);
>>> >- return _M_format_escaped(_String_view(__buf, __n), __fc);
>>> >- }
>>> >- else if constexpr (ranges::random_access_range<_Rg>)
>>> >- {
>>> >- ranges::iterator_t<_Rg> __first = ranges::begin(__rg);
>>> >- ranges::subrange __sub(__first, __first + __w);
>>> >- return _M_format_escaped(_String(from_range, __sub),
>>> >__fc);
>>> >- }
>>> >- else if (__w <= __n)
>>> >- {
>>> >- ranges::subrange __sub(
>>> >- counted_iterator(ranges::begin(__rg), __w),
>>> >- default_sentinel);
>>> >- return _M_format_escaped(_String(from_range, __sub),
>>> >__fc);
>>> >- }
>>> >- else if constexpr (ranges::sized_range<_Rg>)
>>> >- return _M_format_escaped(_String(from_range, __rg), __fc);
>>> >- else
>>> >- {
>>> >- // N.B. preserve the computed size
>>> >- ranges::subrange __sub(__rg, __n);
>>> >- return _M_format_escaped(_String(from_range, __sub),
>>> >__fc);
>>> >- }
>>> >- }
>>> >- else
>>> >- return _M_format_escaped(_String(from_range, __rg), __fc);
>>> > }
>>> >
>>> > constexpr void
>>> >@@ -3915,6 +3915,93 @@ namespace __format
>>> > }
>>> > };
>>> >
>>> >+ template<typename _Out, typename _CharT>
>>> >+ class _Escaping_sink : public _Buf_sink<_CharT>
>>> >+ {
>>> >+ using _Esc = _Escapes<_CharT>;
>>> >+
>>> >+ _Out _M_out;
>>> >+ _Term_char _M_term : 2;
>>> >+ unsigned _M_prev_escape : 1;
>>> >+ unsigned _M_out_discards : 1;
>>> >+
>>> >+ void
>>> >+ _M_sync_discarding()
>>> >+ {
>>> >+ if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>)
>>> >+ _M_out_discards = _M_out._M_discarding();
>>> >+ }
>>> >+
>>> >+ void
>>> >+ _M_write()
>>> >+ {
>>> >+ span<_CharT> __bytes = this->_M_used();
>>> >+ basic_string_view<_CharT> __str(__bytes.data(), __bytes.size());
>>> >+
>>> >+ size_t __rem = 0;
>>> >+ if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
>>> >+ {
>>> >+ bool __prev_escape = _M_prev_escape;
>>> >+ _M_out = __format::__write_escaped_unicode_part(
>>> >+ std::move(_M_out), __str, __prev_escape, _M_term);
>>> >+ _M_prev_escape = __prev_escape;
>>> >+
>>> >+ __rem = __str.size();
>>> >+ if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]]
>>> >+ ranges::move(__str, this->_M_buf);
>>> >+ }
>>> >+ else
>>> >+ _M_out = __format::__write_escaped_ascii(
>>> >+ std::move(_M_out), __str, _M_term);
>>> >+
>>> >+ this->_M_reset(this->_M_buf, __rem);
>>> >+ _M_sync_discarding();
>>> >+ }
>>> >+
>>> >+ void
>>> >+ _M_overflow() override
>>> >+ {
>>> >+ if (_M_out_discards)
>>> >+ this->_M_rewind();
>>> >+ else
>>> >+ _M_write();
>>> >+ }
>>> >+
>>> >+ bool
>>> >+ _M_discarding() const override
>>> >+ { return _M_out_discards; }
>>> >+
>>> >+ public:
>>> >+ [[__gnu__::__always_inline__]]
>>> >+ explicit
>>> >+ _Escaping_sink(_Out __out, _Term_char __term)
>>> >+ : _M_out(std::move(__out)), _M_term(__term),
>>> >+ _M_prev_escape(true), _M_out_discards(false)
>>> >+ {
>>> >+ _M_out = __format::__write(std::move(_M_out),
>>> >_Esc::_S_term(_M_term));
>>> >+ _M_sync_discarding();
>>> >+ }
>>> >+
>>> >+ _Out
>>> >+ _M_finish()
>>> >+ {
>>> >+ if (_M_out_discards)
>>> >+ return std::move(_M_out);
>>> >+
>>> >+ if (!this->_M_used().empty())
>>> >+ {
>>> >+ _M_write();
>>> >+ if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
>>> >+ if (auto __rem = this->_M_used(); !__rem.empty())
>>> >+ {
>>> >+ basic_string_view<_CharT> __str(__rem.data(), __rem.size());
>>> >+ _M_out = __format::__write_escape_seqs(std::move(_M_out),
>>> >__str);
>>> >+ }
>>> >+ }
>>> >+ return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
>>> >+ }
>>> >+ };
>>> >+
>>> > enum class _Arg_t : unsigned char {
>>> > _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull,
>>> > _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr,
>>> > _Arg_handle,
>>> >diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc
>>> >b/libstdc++-v3/testsuite/std/format/ranges/string.cc
>>> >index 99e5eaf411f..a7d584f8e42 100644
>>> >--- a/libstdc++-v3/testsuite/std/format/ranges/string.cc
>>> >+++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc
>>> >@@ -279,6 +279,93 @@ void test_padding()
>>> > VERIFY( strip_prefix(resv, 46, '*') );
>>> > VERIFY( strip_quotes(resv) );
>>> > VERIFY( resv == in );
>>> >+
>>> >+ // width is 5, size is 15
>>> >+ in = "\u2160\u2161\u2162\u2163\u2164";
>>> >+ in += in; // width is 10, size is 30
>>> >+ in += in; // width is 20, size is 60
>>> >+ in += in; // width is 40, size is 120
>>> >+ in += in; // width is 80, size is 240
>>> >+ in += in; // width is 160, size is 480
>>> >+
>>> >+ lc.assign_range(in);
>>> >+
>>> >+ resv = res = std::format("{:s}", lc);
>>> >+ VERIFY( resv == in );
>>> >+
>>> >+ resv = res = std::format("{:*>10s}", lc);
>>> >+ VERIFY( resv == in );
>>> >+
>>> >+ resv = res = std::format("{:*>200s}", lc);
>>> >+ VERIFY( strip_prefix(resv, 40, '*') );
>>> >+ VERIFY( resv == in );
>>> >+
>>> >+ resv = res = std::format("{:?s}", lc);
>>> >+ VERIFY( strip_quotes(resv) );
>>> >+ VERIFY( resv == in );
>>> >+
>>> >+ resv = res = std::format("{:*>10?s}", lc);
>>> >+ VERIFY( strip_quotes(resv) );
>>> >+ VERIFY( resv == in );
>>> >+
>>> >+ resv = res = std::format("{:*>200?s}", lc);
>>> >+ VERIFY( strip_prefix(resv, 38, '*') );
>>> >+ VERIFY( strip_quotes(resv) );
>>> >+ VERIFY( resv == in );
>>> >+}
>>> >+
>>> >+void test_escaping()
>>> >+{
>>> >+ std::string res;
>>> >+ std::string_view resv;
>>> >+
>>> >+ const std::string_view input =
>>> >+ "\t\n\r\\\""
>>> >+ "\u008a" // Cc, Control, Line Tabulation Set,
>>> >+ "\u00ad" // Cf, Format, Soft Hyphen
>>> >+ "\u1d3d" // Lm, Modifier letter, Modifier Letter Capital Ou
>>> >+ "\u00a0" // Zs, Space Separator, No-Break Space (NBSP)
>>> >+ "\u2029" // Zp, Paragraph Separator, Paragraph Separator
>>> >+ "\U0001f984" // So, Other Symbol, Unicorn Face
>>> >+ ;
>>> >+ const std::string_view output =
>>> >+ R"(\t\n\r\\\")"
>>> >+ R"(\u{8a})"
>>> >+ R"(\u{ad})"
>>> >+ "\u1d3d"
>>> >+ R"(\u{a0})"
>>> >+ R"(\u{2029})"
>>> >+ "\U0001f984";
>>> >+
>>> >+ std::forward_list<char> lc(std::from_range, input);
>>> >+ resv = res = std::format("{:s}", lc);
>>> >+ VERIFY( resv == input );
>>> >+ resv = res = std::format("{:?s}", lc);
>>> >+ VERIFY( strip_quotes(resv) );
>>> >+ VERIFY( resv == output );
>>> >+
>>> >+ // width is 5, size is 15
>>> >+ std::string in = "\u2160\u2161\u2162\u2163\u2164";
>>> >+ in += in; // width is 10, size is 30
>>> >+ in += in; // width is 20, size is 60
>>> >+ in += in; // width is 40, size is 120
>>> >+ in += in; // width is 80, size is 240
>>> >+ in += in; // width is 160, size is 480
>>> >+ std::string_view inv = in;
>>> >+
>>> >+ // last character is incomplete
>>> >+ lc.assign_range(inv.substr(0, 479));
>>> >+
>>> >+ // non-debug format, chars copied as is
>>> >+ resv = res = std::format("{:s}", lc);
>>> >+ VERIFY( resv == inv.substr(0, 479) );
>>> >+
>>> >+ // debug-format, incomplete code-point sequence is escaped
>>> >+ resv = res = std::format("{:?s}", lc);
>>> >+ VERIFY( strip_quotes(resv) );
>>> >+ VERIFY( resv.substr(0, 477) == inv.substr(0, 477) );
>>> >+ resv.remove_prefix(477);
>>> >+ VERIFY( resv == R"(\x{e2}\x{85})" );
>>> > }
>>> >
>>> > int main()
>>> >@@ -287,4 +374,6 @@ int main()
>>> > test_outputs<char>();
>>> > test_outputs<wchar_t>();
>>> > test_nested();
>>> >+ test_padding();
>>> >+ test_escaping();
>>> > }
>>> >--
>>> >2.51.0
>>> >
>>> >
>>>