This patch implements _Escaping_sink, which stores characters in a local (stack) buffer and, when the buffer is full, escapes its contents and writes them to the underlying sink.
To support that we define the __write_escaped_unicode_part function, which takes __str and __prev_esc by reference. The __prev_esc value is updated based on the last written character, and __str is left non-empty if the buffer ends with an incomplete code point sequence. In that case those characters are not written. The _Escaping_sink will copy such characters to the front of the buffer, so the full representation can be reconstructed. __formatter_str::_M_format_range now uses the _Escaping_sink to escape any non-contiguous sequence of characters. This addresses PR119820 by completely removing the code that constructed a string. PR libstdc++/119820 libstdc++-v3/ChangeLog: * include/std/format (__format::__write_escape_seqs) (__format::_Escaping_sink): Define. (__format::__write_escaped_unicode_part): Extract from __format::__write_escaped_unicode. (__format::__write_escaped_unicode): Forward to __write_escaped_unicode_part. (__formatter_str::_M_format_range): Use _Escaping_sink. * testsuite/std/format/ranges/string.cc: New tests for characters whose code points will be split across buffers. --- There is a Polish saying about "shooting a mosquito with a cannonball", and using _Escaping_sink to resolve PR119820 feels that way. However, I had already implemented _Escaping_sink some time ago, and it was sitting in my repository, waiting for additional tests where a character is split between buffers to be written, and the issue pushed me to add the test. And continuing with the parallel, the mosquitos are eliminated, and we got a very nice cannon. So it seems worth doing. Testing on x86_64-linux. The std/format/* tests passed. OK for trunk? 
libstdc++-v3/include/std/format | 198 +++++++++++++----- .../testsuite/std/format/ranges/string.cc | 34 +++ 2 files changed, 177 insertions(+), 55 deletions(-) diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format index d584b81c78a..eea35df4835 100644 --- a/libstdc++-v3/include/std/format +++ b/libstdc++-v3/include/std/format @@ -105,6 +105,7 @@ namespace __format template<typename _CharT> class _Sink; template<typename _CharT> class _Fixedbuf_sink; template<typename _Out, typename _CharT> class _Padding_sink; + template<typename _Out, typename _CharT> class _Escaping_sink; // Output iterator that writes to a type-erase character sink. template<typename _CharT> @@ -1062,6 +1063,17 @@ namespace __format return ++__out; } + template<typename _Out, typename _CharT> + _Out + __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units) + { + using _UChar = make_unsigned_t<_CharT>; + for (_CharT __c : __units) + __out = __format::__write_escape_seq( + __out, static_cast<_UChar>(__c), _Escapes<_CharT>::_S_x()); + return __out; + } + template<typename _Out, typename _CharT> _Out __write_escaped_char(_Out __out, _CharT __c) @@ -1120,12 +1132,10 @@ namespace __format template<typename _CharT, typename _Out> _Out - __write_escaped_unicode(_Out __out, - basic_string_view<_CharT> __str, - _Term_char __term) + __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& __str, + bool& __prev_esc, _Term_char __term) { using _Str_view = basic_string_view<_CharT>; - using _UChar = make_unsigned_t<_CharT>; using _Esc = _Escapes<_CharT>; static constexpr char32_t __replace = U'\uFFFD'; @@ -1139,10 +1149,10 @@ namespace __format }(); __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str)); + __str = {}; + auto __first = __v.begin(); auto const __last = __v.end(); - - bool __prev_esc = true; while (__first != __last) { bool __esc_ascii = false; @@ -1181,15 +1191,32 @@ namespace __format __out = __format::__write_escaped_char(__out, 
*__first.base()); else if (__esc_unicode) __out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u()); - else // __esc_replace - for (_CharT __c : _Str_view(__first.base(), __first._M_units())) - __out = __format::__write_escape_seq(__out, - static_cast<_UChar>(__c), - _Esc::_S_x()); + // __esc_replace + else if (_Str_view __units(__first.base(), __first._M_units()); + __units.end() != __last.base()) + __out = __format::__write_escape_seqs(__out, __units); + else + { + __str = __units; + return __out; + } + __prev_esc = true; ++__first; - } + + return __out; + } + + template<typename _CharT, typename _Out> + _Out + __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str, + _Term_char __term) + { + bool __prev_escape = true; + __out = __format::__write_escaped_unicode_part(__out, __str, + __prev_escape, __term); + __out = __format::__write_escape_seqs(__out, __str); return __out; } @@ -1408,55 +1435,28 @@ namespace __format size_t(ranges::distance(__rg))); return format(__str, __fc); } - else if (!_M_spec._M_debug) + else { + auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut __nout) + { + if (!_M_spec._M_debug) + return ranges::copy(__rg, std::move(__nout)).out; + + _Escaping_sink<_NOut, _CharT> + __sink(std::move(__nout), _Term_quote); + ranges::copy(__rg, __sink.out()); + return __sink._M_finish(); + }; + const size_t __padwidth = _M_spec._M_get_width(__fc); if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none) - return ranges::copy(__rg, __fc.out()).out; + return __handle_debug(__fc.out()); - _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth, - _M_spec._M_get_precision(__fc)); - ranges::copy(__rg, __sink.out()); + _Padding_sink<_Out, _CharT> + __sink(__fc.out(), __padwidth, _M_spec._M_get_precision(__fc)); + __handle_debug(__sink.out()); return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill); } - else if constexpr (ranges::forward_range<_Rg> || ranges::sized_range<_Rg>) - { - const size_t __n(ranges::distance(__rg)); - 
size_t __w = __n; - if constexpr (!__unicode::__literal_encoding_is_unicode<_CharT>()) - if (size_t __max = _M_spec._M_get_precision(__fc); __n > __max) - __w == __max; - - if (__w <= __format::__stackbuf_size<_CharT>) - { - _CharT __buf[__format::__stackbuf_size<_CharT>]; - ranges::copy_n(ranges::begin(__rg), __w, __buf); - return _M_format_escaped(_String_view(__buf, __n), __fc); - } - else if constexpr (ranges::random_access_range<_Rg>) - { - ranges::iterator_t<_Rg> __first = ranges::begin(__rg); - ranges::subrange __sub(__first, __first + __w); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - else if (__w <= __n) - { - ranges::subrange __sub( - counted_iterator(ranges::begin(__rg), __w), - default_sentinel); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - else if constexpr (ranges::sized_range<_Rg>) - return _M_format_escaped(_String(from_range, __rg), __fc); - else - { - // N.B. preserve the computed size - ranges::subrange __sub(__rg, __n); - return _M_format_escaped(_String(from_range, __sub), __fc); - } - } - else - return _M_format_escaped(_String(from_range, __rg), __fc); } constexpr void @@ -3888,6 +3888,94 @@ namespace __format } }; + template<typename _Out, typename _CharT> + class _Escaping_sink : public _Buf_sink<_CharT> + { + using _Esc = _Escapes<_CharT>; + + _Out _M_out; + _Term_char _M_term : 2; + unsigned _M_prev_escape : 1; + unsigned _M_out_discards : 1; + + void + _M_sync_discarding() + { + if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>) + _M_out_discards = _M_out._M_discarding(); + } + + void + _M_write() + { + span<_CharT> __bytes = this->_M_used(); + basic_string_view<_CharT> __str(__bytes.data(), __bytes.size()); + + size_t __rem = 0; + if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>()) + { + bool __prev_escape = _M_prev_escape; + _M_out = __format::__write_escaped_unicode_part( + std::move(_M_out), __str, __prev_escape, _M_term); + _M_prev_escape = __prev_escape; + + __rem = 
__str.size(); + if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]] + ranges::move(__str, this->_M_buf); + } + else + _M_out = __format::__write_escaped_ascii( + std::move(_M_out), __str, _M_term); + + this->_M_reset(this->_M_buf, __rem); + _M_sync_discarding(); + } + + void + _M_overflow() override + { + if (_M_out_discards) + this->_M_rewind(); + else + _M_write(); + } + + bool + _M_discarding() const override + { return _M_out_discards; } + + public: + [[__gnu__::__always_inline__]] + explicit + _Escaping_sink(_Out __out, _Term_char __term) + : _M_out(std::move(__out)), _M_term(__term), + _M_prev_escape(true), _M_out_discards(false) + { + _M_out = __format::__write(std::move(_M_out), _Esc::_S_term(_M_term)); + _M_sync_discarding(); + } + + _Out + _M_finish() + { + if (_M_out_discards) + return std::move(_M_out); + + if (!this->_M_used().empty()) + { + _M_write(); + if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>()) + if (auto __rem = this->_M_used(); !__rem.empty()) + { + basic_string_view<_CharT> __str(__rem.data(), __rem.size()); + _M_out = __format::__write_escape_seqs(std::move(_M_out), __str); + } + } + return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term)); + + } + }; + enum class _Arg_t : unsigned char { _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull, _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle, diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc b/libstdc++-v3/testsuite/std/format/ranges/string.cc index 99e5eaf411f..0e856b32d44 100644 --- a/libstdc++-v3/testsuite/std/format/ranges/string.cc +++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc @@ -279,6 +279,39 @@ void test_padding() VERIFY( strip_prefix(resv, 46, '*') ); VERIFY( strip_quotes(resv) ); VERIFY( resv == in ); + + // width is 5, size is 15 + in = "\u2160\u2161\u2162\u2163\u2164"; + in += in; // width is 10, size is 30 + in += in; // width is 20, size is 60 + in += in; // width is 40, size is 120 + 
in += in; // width is 80, size is 240 + in += in; // width is 160, size is 480 + + lc = std::forward_list<char>(std::from_range, in); + + resv = res = std::format("{:s}", lc); + VERIFY( resv == in ); + + resv = res = std::format("{:*>10s}", lc); + VERIFY( resv == in ); + + resv = res = std::format("{:*>200s}", lc); + VERIFY( strip_prefix(resv, 40, '*') ); + VERIFY( resv == in ); + + resv = res = std::format("{:?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == in ); + + resv = res = std::format("{:*>10?s}", lc); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == in ); + + resv = res = std::format("{:*>200?s}", lc); + VERIFY( strip_prefix(resv, 38, '*') ); + VERIFY( strip_quotes(resv) ); + VERIFY( resv == in ); } int main() @@ -287,4 +320,5 @@ int main() test_outputs<char>(); test_outputs<wchar_t>(); test_nested(); + test_padding(); } -- 2.49.0