This patch implements _Escaping_sink, which stores characters in a local (stack) buffer. When the buffer is full, the range of characters is escaped and written to the underlying sink.
To support the above, the __write_escaped_unicode_part function is defined. It takes __str and __prev_esc by reference. The __prev_esc value is updated based on the last character written. If the buffer ends with an incomplete code point sequence, __str is left non-empty and those characters are not written. _Escaping_sink then copies these characters to the front of the buffer so that the full code point can be reconstructed. __formatter_str::_M_format_range now uses _Escaping_sink to escape any non-contiguous character sequences. This addresses PR119820 by completely removing the code that constructed a string. PR libstdc++/PR119820 libstdc++-v3/ChangeLog: * include/std/format (__format::__write_escape_seqs) (__format::_Escaping_sink): Define. (__format::__write_escaped_unicode_part): Extract from __format::__write_escaped_unicode. (__format::__write_escaped_unicode): Forward to __write_escaped_unicode_part. (__formatter_str::_M_format_range): Use _Escaping_sink. * testsuite/std/format/ranges/string.cc: New tests for characters whose code points will be split in the buffer, and for escaping. Invoke test_padding. --- v2 just updates the patch description. libstdc++-v3/include/std/format | 197 +++++++++++++----- .../testsuite/std/format/ranges/string.cc | 89 ++++++++ 2 files changed, 231 insertions(+), 55 deletions(-) diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format index d6a2170e45d..347f9f0a479 100644 --- a/libstdc++-v3/include/std/format +++ b/libstdc++-v3/include/std/format @@ -105,6 +105,7 @@ namespace __format template<typename _CharT> class _Sink; template<typename _CharT> class _Fixedbuf_sink; template<typename _Out, typename _CharT> class _Padding_sink; + template<typename _Out, typename _CharT> class _Escaping_sink; // Output iterator that writes to a type-erase character sink. 
template<typename _CharT> @@ -1066,6 +1067,17 @@ namespace __format return ++__out; } + template<typename _Out, typename _CharT> + _Out + __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units) + { + using _UChar = make_unsigned_t<_CharT>; + for (_CharT __c : __units) + __out = __format::__write_escape_seq( + __out, static_cast<_UChar>(__c), _Escapes<_CharT>::_S_x()); + return __out; + } + template<typename _Out, typename _CharT> _Out __write_escaped_char(_Out __out, _CharT __c) @@ -1124,12 +1136,10 @@ namespace __format template<typename _CharT, typename _Out> _Out - __write_escaped_unicode(_Out __out, - basic_string_view<_CharT> __str, - _Term_char __term) + __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& __str, + bool& __prev_esc, _Term_char __term) { using _Str_view = basic_string_view<_CharT>; - using _UChar = make_unsigned_t<_CharT>; using _Esc = _Escapes<_CharT>; static constexpr char32_t __replace = U'\uFFFD'; @@ -1143,10 +1153,10 @@ namespace __format }(); __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str)); + __str = {}; + auto __first = __v.begin(); auto const __last = __v.end(); - - bool __prev_esc = true; while (__first != __last) { bool __esc_ascii = false; @@ -1185,15 +1195,32 @@ namespace __format __out = __format::__write_escaped_char(__out, *__first.base()); else if (__esc_unicode) __out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u()); - else // __esc_replace - for (_CharT __c : _Str_view(__first.base(), __first._M_units())) - __out = __format::__write_escape_seq(__out, - static_cast<_UChar>(__c), - _Esc::_S_x()); + // __esc_replace + else if (_Str_view __units(__first.base(), __first._M_units()); + __units.end() != __last.base()) + __out = __format::__write_escape_seqs(__out, __units); + else + { + __str = __units; + return __out; + } + __prev_esc = true; ++__first; - } + + return __out; + } + + template<typename _CharT, typename _Out> + _Out + __write_escaped_unicode(_Out __out, 
basic_string_view<_CharT> __str, + _Term_char __term) + { + bool __prev_escape = true; + __out = __format::__write_escaped_unicode_part(__out, __str, + __prev_escape, __term); + __out = __format::__write_escape_seqs(__out, __str); return __out; } @@ -1412,55 +1439,28 @@ namespace __format size_t(ranges::distance(__rg))); return format(__str, __fc); } - else if (!_M_spec._M_debug) + else { + auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut __nout) + { + if (!_M_spec._M_debug) + return ranges::copy(__rg, std::move(__nout)).out; + + _Escaping_sink<_NOut, _CharT> + __sink(std::move(__nout), _Term_quote); + ranges::copy(__rg, __sink.out()); + return __sink._M_finish(); + }; + const size_t __padwidth = _M_spec._M_get_width(__fc); if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none) - return ranges::copy(__rg, __fc.out()).out; + return __handle_debug(__fc.out()); - _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth, - _M_spec._M_get_precision(__fc)); - ranges::copy(__rg, __sink.out()); + _Padding_sink<_Out, _CharT> + __sink(__fc.out(), __padwidth, _M_spec._M_get_precision(__fc)); + __handle_debug(__sink.out()); return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill); } - else if constexpr (ranges::forward_range<_Rg> || ranges::sized_range<_Rg>) - { - const size_t __n(ranges::distance(__rg)); - size_t __w = __n; - if constexpr (!__unicode::__literal_encoding_is_unicode<_CharT>()) - if (size_t __max = _M_spec._M_get_precision(__fc); __n > __max) - __w == __max; - - if (__w <= __format::__stackbuf_size<_CharT>) - { - _CharT __buf[__format::__stackbuf_size<_CharT>]; - ranges::copy_n(ranges::begin(__rg), __w, __buf); - return _M_format_escaped(_String_view(__buf, __n), __fc); - } - else if constexpr (ranges::random_access_range<_Rg>) - { - ranges::iterator_t<_Rg> __first = ranges::begin(__rg); - ranges::subrange __sub(__first, __first + __w); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - else if (__w <= __n) - { - ranges::subrange 
__sub( - counted_iterator(ranges::begin(__rg), __w), - default_sentinel); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - else if constexpr (ranges::sized_range<_Rg>) - return _M_format_escaped(_String(from_range, __rg), __fc); - else - { - // N.B. preserve the computed size - ranges::subrange __sub(__rg, __n); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - } - else - return _M_format_escaped(_String(from_range, __rg), __fc); } constexpr void @@ -3915,6 +3915,93 @@ namespace __format } }; + template<typename _Out, typename _CharT> + class _Escaping_sink : public _Buf_sink<_CharT> + { + using _Esc = _Escapes<_CharT>; + + _Out _M_out; + _Term_char _M_term : 2; + unsigned _M_prev_escape : 1; + unsigned _M_out_discards : 1; + + void + _M_sync_discarding() + { + if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>) + _M_out_discards = _M_out._M_discarding(); + } + + void + _M_write() + { + span<_CharT> __bytes = this->_M_used(); + basic_string_view<_CharT> __str(__bytes.data(), __bytes.size()); + + size_t __rem = 0; + if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>()) + { + bool __prev_escape = _M_prev_escape; + _M_out = __format::__write_escaped_unicode_part( + std::move(_M_out), __str, __prev_escape, _M_term); + _M_prev_escape = __prev_escape; + + __rem = __str.size(); + if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]] + ranges::move(__str, this->_M_buf); + } + else + _M_out = __format::__write_escaped_ascii( + std::move(_M_out), __str, _M_term); + + this->_M_reset(this->_M_buf, __rem); + _M_sync_discarding(); + } + + void + _M_overflow() override + { + if (_M_out_discards) + this->_M_rewind(); + else + _M_write(); + } + + bool + _M_discarding() const override + { return _M_out_discards; } + + public: + [[__gnu__::__always_inline__]] + explicit + _Escaping_sink(_Out __out, _Term_char __term) + : _M_out(std::move(__out)), _M_term(__term), + _M_prev_escape(true), _M_out_discards(false) + { + _M_out = 
__format::__write(std::move(_M_out), _Esc::_S_term(_M_term)); + _M_sync_discarding(); + } + + _Out + _M_finish() + { + if (_M_out_discards) + return std::move(_M_out); + + if (!this->_M_used().empty()) + { + _M_write(); + if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>()) + if (auto __rem = this->_M_used(); !__rem.empty()) + { + basic_string_view<_CharT> __str(__rem.data(), __rem.size()); + _M_out = __format::__write_escape_seqs(std::move(_M_out), __str); + } + } + return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term)); + } + }; + enum class _Arg_t : unsigned char { _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull, _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle, diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc b/libstdc++-v3/testsuite/std/format/ranges/string.cc index 99e5eaf411f..a7d584f8e42 100644 --- a/libstdc++-v3/testsuite/std/format/ranges/string.cc +++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc @@ -279,6 +279,93 @@ void test_padding() VERIFY( strip_prefix(resv, 46, '*') ); VERIFY( strip_quotes(resv) ); VERIFY( resv == in ); + + // width is 5, size is 15 + in = "\u2160\u2161\u2162\u2163\u2164"; + in += in; // width is 10, size is 30 + in += in; // width is 20, size is 60 + in += in; // width is 40, size is 120 + in += in; // width is 80, size is 240 + in += in; // width is 160, size is 480 + + lc.assign_range(in); + + resv = res = std::format("{:s}", lc); + VERIFY( resv == in ); + + resv = res = std::format("{:*>10s}", lc); + VERIFY( resv == in ); + + resv = res = std::format("{:*>200s}", lc); + VERIFY( strip_prefix(resv, 40, '*') ); + VERIFY( resv == in ); + + resv = res = std::format("{:?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == in ); + + resv = res = std::format("{:*>10?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == in ); + + resv = res = std::format("{:*>200?s}", lc); + VERIFY( strip_prefix(resv, 38, '*') ); + VERIFY( 
strip_quotes(resv) ); + VERIFY( resv == in ); +} + +void test_escaping() +{ + std::string res; + std::string_view resv; + + const std::string_view input = + "\t\n\r\\\"" + "\u008a" // Cc, Control, Line Tabulation Set, + "\u00ad" // Cf, Format, Soft Hyphen + "\u1d3d" // Lm, Modifier letter, Modifier Letter Capital Ou + "\u00a0" // Zs, Space Separator, No-Break Space (NBSP) + "\u2029" // Zp, Paragraph Separator, Paragraph Separator + "\U0001f984" // So, Other Symbol, Unicorn Face + ; + const std::string_view output = + R"(\t\n\r\\\")" + R"(\u{8a})" + R"(\u{ad})" + "\u1d3d" + R"(\u{a0})" + R"(\u{2029})" + "\U0001f984"; + + std::forward_list<char> lc(std::from_range, input); + resv = res = std::format("{:s}", lc); + VERIFY( resv == input ); + resv = res = std::format("{:?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == output ); + + // width is 5, size is 15 + std::string in = "\u2160\u2161\u2162\u2163\u2164"; + in += in; // width is 10, size is 30 + in += in; // width is 20, size is 60 + in += in; // width is 40, size is 120 + in += in; // width is 80, size is 240 + in += in; // width is 160, size is 480 + std::string_view inv = in; + + // last charcter is incomplete + lc.assign_range(inv.substr(0, 479)); + + // non-debug format, chars copied as is + resv = res = std::format("{:s}", lc); + VERIFY( resv == inv.substr(0, 479) ); + + // debug-format, incomplete code-point sequence is copied + resv = res = std::format("{:?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv.substr(0, 477) == inv.substr(0, 477) ); + resv.remove_prefix(477); + VERIFY( resv == R"(\x{e2}\x{85})" ); } int main() @@ -287,4 +374,6 @@ int main() test_outputs<char>(); test_outputs<wchar_t>(); test_nested(); + test_padding(); + test_escaping(); } -- 2.51.0