On Wed, 10 Sep 2025, Tomasz Kamiński wrote:

> This patch implements _Escaping_sink, which stores characters in a local
> (stack) buffer. When the buffer is full, the range of characters is escaped
> and written to the underlying sink.
> 
> To support the above, the __write_escaped_unicode_part function is defined.
> It takes __str and __prev_esc by reference. The __prev_esc value is updated
> based on the last character written. If the buffer ends with an incomplete
> code point sequence, __str is left non-empty (referring to that incomplete
> sequence) and those characters are not written. _Escaping_sink then copies
> these characters to the front of the buffer to reconstruct the full code
> point.
> 
> __formatter_str::_M_format_range now uses _Escaping_sink to escape any
> non-contiguous character sequences.
> 
> This addresses PR119820 by completely removing the code that constructed
> a string.
> 
>       PR libstdc++/119820
> 
> libstdc++-v3/ChangeLog:
> 
>       * include/std/format (__format::__write_escape_seqs)
>       (__format::_Escaping_sink): Define.
>       (__format::__write_escaped_unicode_part): Extract from
>       __format::__write_escaped_unicode.
>       (__format::__write_escaped_unicode): Forward to
>       __write_escaped_unicode_part.
>       (__formatter_str::_M_format_range): Use _Escaping_sink.
>       * testsuite/std/format/ranges/string.cc: New tests for
>       characters whose code points are split across buffer
>       boundaries and for escaping. Invoke test_padding.

Looks good to me!

> ---
> v2 just updates the patch description.
> 
>  libstdc++-v3/include/std/format               | 197 +++++++++++++-----
>  .../testsuite/std/format/ranges/string.cc     |  89 ++++++++
>  2 files changed, 231 insertions(+), 55 deletions(-)
> 
> diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format
> index d6a2170e45d..347f9f0a479 100644
> --- a/libstdc++-v3/include/std/format
> +++ b/libstdc++-v3/include/std/format
> @@ -105,6 +105,7 @@ namespace __format
>    template<typename _CharT> class _Sink;
>    template<typename _CharT> class _Fixedbuf_sink;
>    template<typename _Out, typename _CharT> class _Padding_sink;
> +  template<typename _Out, typename _CharT> class _Escaping_sink;
>  
>    // Output iterator that writes to a type-erase character sink.
>    template<typename _CharT>
> @@ -1066,6 +1067,17 @@ namespace __format
>        return ++__out;
>      }
>  
> +  template<typename _Out, typename _CharT>
> +    _Out
> +    __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units)
> +    {
> +      using _UChar = make_unsigned_t<_CharT>;
> +      for (_CharT __c : __units)
> +     __out = __format::__write_escape_seq(
> +               __out, static_cast<_UChar>(__c), _Escapes<_CharT>::_S_x());
> +      return __out;
> +    }
> +
>    template<typename _Out, typename _CharT>
>      _Out
>      __write_escaped_char(_Out __out, _CharT __c)
> @@ -1124,12 +1136,10 @@ namespace __format
>  
>    template<typename _CharT, typename _Out>
>      _Out
> -    __write_escaped_unicode(_Out __out,
> -                         basic_string_view<_CharT> __str,
> -                         _Term_char __term)
> +    __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& 
> __str,
> +                              bool& __prev_esc, _Term_char __term)
>      {
>        using _Str_view = basic_string_view<_CharT>;
> -      using _UChar = make_unsigned_t<_CharT>;
>        using _Esc = _Escapes<_CharT>;
>  
>        static constexpr char32_t __replace = U'\uFFFD';
> @@ -1143,10 +1153,10 @@ namespace __format
>       }();
>  
>        __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str));
> +      __str = {};
> +
>        auto __first = __v.begin();
>        auto const __last = __v.end();
> -
> -      bool __prev_esc = true;
>        while (__first != __last)
>       {
>         bool __esc_ascii = false;
> @@ -1185,15 +1195,32 @@ namespace __format
>           __out = __format::__write_escaped_char(__out, *__first.base());
>         else if (__esc_unicode)
>           __out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u());
> -       else // __esc_replace
> -         for (_CharT __c : _Str_view(__first.base(), __first._M_units()))
> -           __out = __format::__write_escape_seq(__out,
> -                                                static_cast<_UChar>(__c),
> -                                                _Esc::_S_x());
> +       // __esc_replace
> +       else if (_Str_view __units(__first.base(), __first._M_units());
> +                __units.end() != __last.base())
> +         __out = __format::__write_escape_seqs(__out, __units);
> +       else
> +         {
> +           __str = __units;
> +           return __out;
> +         }
> +
>         __prev_esc = true;
>         ++__first;
> -
>       }
> +
> +      return __out;
> +    }
> +
> +  template<typename _CharT, typename _Out>
> +    _Out
> +    __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str,
> +                         _Term_char __term)
> +    {
> +      bool __prev_escape = true;
> +      __out = __format::__write_escaped_unicode_part(__out, __str,
> +                                                  __prev_escape, __term);
> +      __out = __format::__write_escape_seqs(__out, __str);
>        return __out;
>      }
>  
> @@ -1412,55 +1439,28 @@ namespace __format
>                                size_t(ranges::distance(__rg)));
>             return format(__str, __fc);
>           }
> -       else if (!_M_spec._M_debug)
> +       else
>           {
> +           auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut __nout)
> +             {
> +               if (!_M_spec._M_debug)
> +                 return ranges::copy(__rg, std::move(__nout)).out;
> +
> +               _Escaping_sink<_NOut, _CharT>
> +                 __sink(std::move(__nout), _Term_quote);
> +               ranges::copy(__rg, __sink.out());
> +               return __sink._M_finish();
> +             };
> +
>             const size_t __padwidth = _M_spec._M_get_width(__fc);
>             if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none)
> -             return ranges::copy(__rg, __fc.out()).out;
> +             return __handle_debug(__fc.out());
>  
> -           _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth,
> -                                              
> _M_spec._M_get_precision(__fc));
> -           ranges::copy(__rg, __sink.out());
> +           _Padding_sink<_Out, _CharT>
> +             __sink(__fc.out(), __padwidth, _M_spec._M_get_precision(__fc));
> +           __handle_debug(__sink.out());
>             return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
>           }
> -       else if constexpr (ranges::forward_range<_Rg> || 
> ranges::sized_range<_Rg>)
> -         {
> -           const size_t __n(ranges::distance(__rg));
> -           size_t __w = __n;
> -           if constexpr (!__unicode::__literal_encoding_is_unicode<_CharT>())
> -             if (size_t __max = _M_spec._M_get_precision(__fc); __n > __max)
> -               __w == __max;
> -
> -           if (__w <= __format::__stackbuf_size<_CharT>)
> -             {
> -               _CharT __buf[__format::__stackbuf_size<_CharT>];
> -               ranges::copy_n(ranges::begin(__rg), __w, __buf);
> -               return _M_format_escaped(_String_view(__buf, __n), __fc);
> -             }
> -           else if constexpr (ranges::random_access_range<_Rg>)
> -             {
> -               ranges::iterator_t<_Rg> __first = ranges::begin(__rg);
> -               ranges::subrange __sub(__first, __first + __w);
> -               return _M_format_escaped(_String(from_range, __sub), __fc);
> -             }
> -           else if (__w <= __n)
> -             {
> -               ranges::subrange __sub(
> -                 counted_iterator(ranges::begin(__rg), __w),
> -                 default_sentinel);
> -               return _M_format_escaped(_String(from_range, __sub), __fc);
> -             }
> -           else if constexpr (ranges::sized_range<_Rg>)
> -             return _M_format_escaped(_String(from_range, __rg), __fc);
> -           else
> -             {
> -               // N.B. preserve the computed size
> -               ranges::subrange __sub(__rg, __n);
> -               return _M_format_escaped(_String(from_range, __sub), __fc);
> -             }
> -         }
> -       else
> -         return _M_format_escaped(_String(from_range, __rg), __fc);
>       }
>  
>        constexpr void
> @@ -3915,6 +3915,93 @@ namespace __format
>        }
>      };
>  
> +  template<typename _Out, typename _CharT>
> +    class _Escaping_sink : public _Buf_sink<_CharT>
> +    {
> +      using _Esc = _Escapes<_CharT>;
> +
> +      _Out _M_out;
> +      _Term_char _M_term : 2;
> +      unsigned _M_prev_escape : 1;
> +      unsigned _M_out_discards : 1;
> +
> +      void
> +      _M_sync_discarding()
> +      {
> +     if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>)
> +       _M_out_discards = _M_out._M_discarding();
> +      }
> +
> +      void
> +      _M_write()
> +      {
> +     span<_CharT> __bytes = this->_M_used();
> +     basic_string_view<_CharT> __str(__bytes.data(), __bytes.size());
> +
> +     size_t __rem = 0;
> +     if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
> +       {
> +         bool __prev_escape = _M_prev_escape;
> +         _M_out = __format::__write_escaped_unicode_part(
> +                    std::move(_M_out), __str, __prev_escape, _M_term);
> +         _M_prev_escape = __prev_escape;
> +
> +         __rem = __str.size();
> +         if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]]
> +           ranges::move(__str, this->_M_buf);
> +       }
> +     else
> +       _M_out = __format::__write_escaped_ascii(
> +                   std::move(_M_out), __str, _M_term);
> +
> +     this->_M_reset(this->_M_buf, __rem);
> +     _M_sync_discarding();
> +      }
> +
> +      void
> +      _M_overflow() override
> +      {
> +     if (_M_out_discards)
> +       this->_M_rewind();
> +     else
> +       _M_write();
> +      }
> +
> +      bool
> +      _M_discarding() const override
> +      { return _M_out_discards; }
> +
> +    public:
> +      [[__gnu__::__always_inline__]]
> +      explicit
> +      _Escaping_sink(_Out __out, _Term_char __term)
> +      : _M_out(std::move(__out)), _M_term(__term),
> +     _M_prev_escape(true), _M_out_discards(false)
> +      {
> +     _M_out = __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
> +     _M_sync_discarding();
> +      }
> +
> +      _Out
> +      _M_finish()
> +      {
> +     if (_M_out_discards)
> +       return std::move(_M_out);
> +
> +     if (!this->_M_used().empty())
> +     {
> +       _M_write();
> +       if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
> +         if (auto __rem = this->_M_used(); !__rem.empty())
> +           {
> +             basic_string_view<_CharT> __str(__rem.data(), __rem.size());
> +             _M_out = __format::__write_escape_seqs(std::move(_M_out), 
> __str);
> +           }
> +     }
> +     return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
> +      }
> +    };
> +
>    enum class _Arg_t : unsigned char {
>      _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull,
>      _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle,
> diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc 
> b/libstdc++-v3/testsuite/std/format/ranges/string.cc
> index 99e5eaf411f..a7d584f8e42 100644
> --- a/libstdc++-v3/testsuite/std/format/ranges/string.cc
> +++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc
> @@ -279,6 +279,93 @@ void test_padding()
>    VERIFY( strip_prefix(resv, 46, '*') );
>    VERIFY( strip_quotes(resv) );
>    VERIFY( resv == in );
> +
> +  // width is 5, size is 15
> +  in = "\u2160\u2161\u2162\u2163\u2164";
> +  in += in; // width is 10, size is 30
> +  in += in; // width is 20, size is 60
> +  in += in; // width is 40, size is 120
> +  in += in; // width is 80, size is 240
> +  in += in; // width is 160, size is 480
> +
> +  lc.assign_range(in);
> +
> +  resv = res = std::format("{:s}", lc);
> +  VERIFY( resv == in );
> +
> +  resv = res = std::format("{:*>10s}", lc);
> +  VERIFY( resv == in );
> +
> +  resv = res = std::format("{:*>200s}", lc);
> +  VERIFY( strip_prefix(resv, 40, '*') );
> +  VERIFY( resv == in );
> +
> +  resv = res = std::format("{:?s}", lc);
> +  VERIFY( strip_quotes(resv) );
> +  VERIFY( resv == in );
> +
> +  resv = res = std::format("{:*>10?s}", lc);
> +  VERIFY( strip_quotes(resv) );
> +  VERIFY( resv == in );
> +
> +  resv = res = std::format("{:*>200?s}", lc);
> +  VERIFY( strip_prefix(resv, 38, '*') );
> +  VERIFY( strip_quotes(resv) );
> +  VERIFY( resv == in );
> +}
> +
> +void test_escaping()
> +{
> +  std::string res;
> +  std::string_view resv;
> +
> +  const std::string_view input =
> +    "\t\n\r\\\""
> +    "\u008a"     // Cc, Control,             Line Tabulation Set,
> +    "\u00ad"     // Cf, Format,              Soft Hyphen
> +    "\u1d3d"     // Lm, Modifier letter,     Modifier Letter Capital Ou
> +    "\u00a0"     // Zs, Space Separator,     No-Break Space (NBSP)
> +    "\u2029"     // Zp, Paragraph Separator, Paragraph Separator
> +    "\U0001f984" // So, Other Symbol,        Unicorn Face
> +  ;
> +  const std::string_view output =
> +   R"(\t\n\r\\\")"
> +   R"(\u{8a})"
> +   R"(\u{ad})"
> +   "\u1d3d"
> +   R"(\u{a0})"
> +   R"(\u{2029})"
> +   "\U0001f984";
> +
> +  std::forward_list<char> lc(std::from_range, input);
> +  resv = res = std::format("{:s}", lc);
> +  VERIFY( resv == input );
> +  resv = res = std::format("{:?s}", lc);
> +  VERIFY( strip_quotes(resv) );
> +  VERIFY( resv == output );
> +
> +  // width is 5, size is 15
> +  std::string in = "\u2160\u2161\u2162\u2163\u2164";
> +  in += in; // width is 10, size is 30
> +  in += in; // width is 20, size is 60
> +  in += in; // width is 40, size is 120
> +  in += in; // width is 80, size is 240
> +  in += in; // width is 160, size is 480
> +  std::string_view inv = in;
> +
> +  // last character is incomplete
> +  lc.assign_range(inv.substr(0, 479));
> +
> +  // non-debug format, chars copied as is
> +  resv = res = std::format("{:s}", lc);
> +  VERIFY( resv == inv.substr(0, 479) );
> +
> +  // debug-format, incomplete code-point sequence is escaped
> +  resv = res = std::format("{:?s}", lc);
> +  VERIFY( strip_quotes(resv) );
> +  VERIFY( resv.substr(0, 477) == inv.substr(0, 477) );
> +  resv.remove_prefix(477);
> +  VERIFY( resv == R"(\x{e2}\x{85})" );
>  }
>  
>  int main()
> @@ -287,4 +374,6 @@ int main()
>    test_outputs<char>();
>    test_outputs<wchar_t>();
>    test_nested();
> +  test_padding();
> +  test_escaping();
>  }
> -- 
> 2.51.0
> 
> 

Reply via email to