This patch implements _Escaping_sink that stores characters in a local (stack)
buffer. When the buffer is full, the range of characters is escaped and written
to the underlying sink.

To support the above, the __write_escaped_unicode_part function is defined.
It takes __str and __prev_esc by reference. The __prev_esc value is updated
based on the last character written. If the buffer ends with an incomplete
code point sequence, __str is left non-empty and those characters are not
written. _Escaping_sink then copies these characters to the front of the
buffer so that the full code point can be reconstructed.

__formatter_str::_M_format_range now uses _Escaping_sink to escape any
non-contiguous character sequences.

This addresses PR119820 by completely removing the code that constructed
a string.

        PR libstdc++/119820

libstdc++-v3/ChangeLog:

        * include/std/format (__format::__write_escape_seqs)
        (__format::_Escaping_sink): Define.
        (__format::__write_escaped_unicode_part): Extract from
        __format::__write_escaped_unicode.
        (__format::__write_escaped_unicode): Forward to
        __write_escaped_unicode_part.
        (__formatter_str::_M_format_range): Use _Escaping_sink.
        * testsuite/std/format/ranges/string.cc: New tests for
        characters whose code points are split across buffer boundaries,
        and for escaping. Invoke test_padding.
---
v2 just updates the patch description.

 libstdc++-v3/include/std/format               | 197 +++++++++++++-----
 .../testsuite/std/format/ranges/string.cc     |  89 ++++++++
 2 files changed, 231 insertions(+), 55 deletions(-)

diff --git a/libstdc++-v3/include/std/format b/libstdc++-v3/include/std/format
index d6a2170e45d..347f9f0a479 100644
--- a/libstdc++-v3/include/std/format
+++ b/libstdc++-v3/include/std/format
@@ -105,6 +105,7 @@ namespace __format
   template<typename _CharT> class _Sink;
   template<typename _CharT> class _Fixedbuf_sink;
   template<typename _Out, typename _CharT> class _Padding_sink;
+  template<typename _Out, typename _CharT> class _Escaping_sink;
 
   // Output iterator that writes to a type-erase character sink.
   template<typename _CharT>
@@ -1066,6 +1067,17 @@ namespace __format
       return ++__out;
     }
 
+  template<typename _Out, typename _CharT>
+    _Out
+    __write_escape_seqs(_Out __out, basic_string_view<_CharT> __units)
+    {
+      using _UChar = make_unsigned_t<_CharT>;
+      for (_CharT __c : __units)
+       __out = __format::__write_escape_seq(
+                 __out, static_cast<_UChar>(__c), _Escapes<_CharT>::_S_x());
+      return __out;
+    }
+
   template<typename _Out, typename _CharT>
     _Out
     __write_escaped_char(_Out __out, _CharT __c)
@@ -1124,12 +1136,10 @@ namespace __format
 
   template<typename _CharT, typename _Out>
     _Out
-    __write_escaped_unicode(_Out __out,
-                           basic_string_view<_CharT> __str,
-                           _Term_char __term)
+    __write_escaped_unicode_part(_Out __out, basic_string_view<_CharT>& __str,
+                                bool& __prev_esc, _Term_char __term)
     {
       using _Str_view = basic_string_view<_CharT>;
-      using _UChar = make_unsigned_t<_CharT>;
       using _Esc = _Escapes<_CharT>;
 
       static constexpr char32_t __replace = U'\uFFFD';
@@ -1143,10 +1153,10 @@ namespace __format
        }();
 
       __unicode::_Utf_view<char32_t, _Str_view> __v(std::move(__str));
+      __str = {};
+
       auto __first = __v.begin();
       auto const __last = __v.end();
-
-      bool __prev_esc = true;
       while (__first != __last)
        {
          bool __esc_ascii = false;
@@ -1185,15 +1195,32 @@ namespace __format
            __out = __format::__write_escaped_char(__out, *__first.base());
          else if (__esc_unicode)
            __out = __format::__write_escape_seq(__out, *__first, _Esc::_S_u());
-         else // __esc_replace
-           for (_CharT __c : _Str_view(__first.base(), __first._M_units()))
-             __out = __format::__write_escape_seq(__out,
-                                                  static_cast<_UChar>(__c),
-                                                  _Esc::_S_x());
+         // __esc_replace
+         else if (_Str_view __units(__first.base(), __first._M_units());
+                  __units.end() != __last.base())
+           __out = __format::__write_escape_seqs(__out, __units);
+         else
+           {
+             __str = __units;
+             return __out;
+           }
+
          __prev_esc = true;
          ++__first;
-
        }
+
+      return __out;
+    }
+
+  template<typename _CharT, typename _Out>
+    _Out
+    __write_escaped_unicode(_Out __out, basic_string_view<_CharT> __str,
+                           _Term_char __term)
+    {
+      bool __prev_escape = true;
+      __out = __format::__write_escaped_unicode_part(__out, __str,
+                                                    __prev_escape, __term);
+      __out = __format::__write_escape_seqs(__out, __str);
       return __out;
     }
 
@@ -1412,55 +1439,28 @@ namespace __format
                                 size_t(ranges::distance(__rg)));
              return format(__str, __fc);
            }
-         else if (!_M_spec._M_debug)
+         else
            {
+             auto __handle_debug = [this, &__rg]<typename _NOut>(_NOut __nout)
+               {
+                 if (!_M_spec._M_debug)
+                   return ranges::copy(__rg, std::move(__nout)).out;
+
+                 _Escaping_sink<_NOut, _CharT>
+                   __sink(std::move(__nout), _Term_quote);
+                 ranges::copy(__rg, __sink.out());
+                 return __sink._M_finish();
+               };
+
              const size_t __padwidth = _M_spec._M_get_width(__fc);
              if (__padwidth == 0 && _M_spec._M_prec_kind == _WP_none)
-               return ranges::copy(__rg, __fc.out()).out;
+               return __handle_debug(__fc.out());
 
-             _Padding_sink<_Out, _CharT> __sink(__fc.out(), __padwidth,
-                                                _M_spec._M_get_precision(__fc));
-             ranges::copy(__rg, __sink.out());
+             _Padding_sink<_Out, _CharT>
+               __sink(__fc.out(), __padwidth, _M_spec._M_get_precision(__fc));
+             __handle_debug(__sink.out());
              return __sink._M_finish(_M_spec._M_align, _M_spec._M_fill);
            }
-         else if constexpr (ranges::forward_range<_Rg> || ranges::sized_range<_Rg>)
-           {
-             const size_t __n(ranges::distance(__rg));
-             size_t __w = __n;
-             if constexpr (!__unicode::__literal_encoding_is_unicode<_CharT>())
-               if (size_t __max = _M_spec._M_get_precision(__fc); __n > __max)
-                 __w == __max;
-
-             if (__w <= __format::__stackbuf_size<_CharT>)
-               {
-                 _CharT __buf[__format::__stackbuf_size<_CharT>];
-                 ranges::copy_n(ranges::begin(__rg), __w, __buf);
-                 return _M_format_escaped(_String_view(__buf, __n), __fc);
-               }
-             else if constexpr (ranges::random_access_range<_Rg>)
-               {
-                 ranges::iterator_t<_Rg> __first = ranges::begin(__rg);
-                 ranges::subrange __sub(__first, __first + __w);
-                 return _M_format_escaped(_String(from_range, __sub), __fc);
-               }
-             else if (__w <= __n)
-               {
-                 ranges::subrange __sub(
-                   counted_iterator(ranges::begin(__rg), __w),
-                   default_sentinel);
-                 return _M_format_escaped(_String(from_range, __sub), __fc);
-               }
-             else if constexpr (ranges::sized_range<_Rg>)
-               return _M_format_escaped(_String(from_range, __rg), __fc);
-             else
-               {
-                 // N.B. preserve the computed size
-                 ranges::subrange __sub(__rg, __n);
-                 return _M_format_escaped(_String(from_range, __sub), __fc);
-               }
-           }
-         else
-           return _M_format_escaped(_String(from_range, __rg), __fc);
        }
 
       constexpr void
@@ -3915,6 +3915,93 @@ namespace __format
       }
     };
 
+  template<typename _Out, typename _CharT>
+    class _Escaping_sink : public _Buf_sink<_CharT>
+    {
+      using _Esc = _Escapes<_CharT>;
+
+      _Out _M_out;
+      _Term_char _M_term : 2;
+      unsigned _M_prev_escape : 1;
+      unsigned _M_out_discards : 1;
+
+      void
+      _M_sync_discarding()
+      {
+       if constexpr (is_same_v<_Out, _Sink_iter<_CharT>>)
+         _M_out_discards = _M_out._M_discarding();
+      }
+
+      void
+      _M_write()
+      {
+       span<_CharT> __bytes = this->_M_used();
+       basic_string_view<_CharT> __str(__bytes.data(), __bytes.size());
+
+       size_t __rem = 0;
+       if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
+         {
+           bool __prev_escape = _M_prev_escape;
+           _M_out = __format::__write_escaped_unicode_part(
+                      std::move(_M_out), __str, __prev_escape, _M_term);
+           _M_prev_escape = __prev_escape;
+
+           __rem = __str.size();
+           if (__rem > 0 && __str.data() != this->_M_buf) [[unlikely]]
+             ranges::move(__str, this->_M_buf);
+         }
+       else
+         _M_out = __format::__write_escaped_ascii(
+                     std::move(_M_out), __str, _M_term);
+
+       this->_M_reset(this->_M_buf, __rem);
+       _M_sync_discarding();
+      }
+
+      void
+      _M_overflow() override
+      {
+       if (_M_out_discards)
+         this->_M_rewind();
+       else
+         _M_write();
+      }
+
+      bool
+      _M_discarding() const override
+      { return _M_out_discards; }
+
+    public:
+      [[__gnu__::__always_inline__]]
+      explicit
+      _Escaping_sink(_Out __out, _Term_char __term)
+      : _M_out(std::move(__out)), _M_term(__term),
+       _M_prev_escape(true), _M_out_discards(false)
+      {
+       _M_out = __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
+       _M_sync_discarding();
+      }
+
+      _Out
+      _M_finish()
+      {
+       if (_M_out_discards)
+         return std::move(_M_out);
+
+       if (!this->_M_used().empty())
+       {
+         _M_write();
+         if constexpr (__unicode::__literal_encoding_is_unicode<_CharT>())
+           if (auto __rem = this->_M_used(); !__rem.empty())
+             {
+               basic_string_view<_CharT> __str(__rem.data(), __rem.size());
+               _M_out = __format::__write_escape_seqs(std::move(_M_out), __str);
+             }
+       }
+       return __format::__write(std::move(_M_out), _Esc::_S_term(_M_term));
+      }
+    };
+
   enum class _Arg_t : unsigned char {
     _Arg_none, _Arg_bool, _Arg_c, _Arg_i, _Arg_u, _Arg_ll, _Arg_ull,
     _Arg_flt, _Arg_dbl, _Arg_ldbl, _Arg_str, _Arg_sv, _Arg_ptr, _Arg_handle,
diff --git a/libstdc++-v3/testsuite/std/format/ranges/string.cc b/libstdc++-v3/testsuite/std/format/ranges/string.cc
index 99e5eaf411f..a7d584f8e42 100644
--- a/libstdc++-v3/testsuite/std/format/ranges/string.cc
+++ b/libstdc++-v3/testsuite/std/format/ranges/string.cc
@@ -279,6 +279,93 @@ void test_padding()
   VERIFY( strip_prefix(resv, 46, '*') );
   VERIFY( strip_quotes(resv) );
   VERIFY( resv == in );
+
+  // width is 5, size is 15
+  in = "\u2160\u2161\u2162\u2163\u2164";
+  in += in; // width is 10, size is 30
+  in += in; // width is 20, size is 60
+  in += in; // width is 40, size is 120
+  in += in; // width is 80, size is 240
+  in += in; // width is 160, size is 480
+
+  lc.assign_range(in);
+
+  resv = res = std::format("{:s}", lc);
+  VERIFY( resv == in );
+
+  resv = res = std::format("{:*>10s}", lc);
+  VERIFY( resv == in );
+
+  resv = res = std::format("{:*>200s}", lc);
+  VERIFY( strip_prefix(resv, 40, '*') );
+  VERIFY( resv == in );
+
+  resv = res = std::format("{:?s}", lc);
+  VERIFY( strip_quotes(resv) );
+  VERIFY( resv == in );
+
+  resv = res = std::format("{:*>10?s}", lc);
+  VERIFY( strip_quotes(resv) );
+  VERIFY( resv == in );
+
+  resv = res = std::format("{:*>200?s}", lc);
+  VERIFY( strip_prefix(resv, 38, '*') );
+  VERIFY( strip_quotes(resv) );
+  VERIFY( resv == in );
+}
+
+void test_escaping()
+{
+  std::string res;
+  std::string_view resv;
+
+  const std::string_view input =
+    "\t\n\r\\\""
+    "\u008a"     // Cc, Control,             Line Tabulation Set,
+    "\u00ad"     // Cf, Format,              Soft Hyphen
+    "\u1d3d"     // Lm, Modifier letter,     Modifier Letter Capital Ou
+    "\u00a0"     // Zs, Space Separator,     No-Break Space (NBSP)
+    "\u2029"     // Zp, Paragraph Separator, Paragraph Separator
+    "\U0001f984" // So, Other Symbol,        Unicorn Face
+  ;
+  const std::string_view output =
+   R"(\t\n\r\\\")"
+   R"(\u{8a})"
+   R"(\u{ad})"
+   "\u1d3d"
+   R"(\u{a0})"
+   R"(\u{2029})"
+   "\U0001f984";
+
+  std::forward_list<char> lc(std::from_range, input);
+  resv = res = std::format("{:s}", lc);
+  VERIFY( resv == input );
+  resv = res = std::format("{:?s}", lc);
+  VERIFY( strip_quotes(resv) );
+  VERIFY( resv == output );
+
+  // width is 5, size is 15
+  std::string in = "\u2160\u2161\u2162\u2163\u2164";
+  in += in; // width is 10, size is 30
+  in += in; // width is 20, size is 60
+  in += in; // width is 40, size is 120
+  in += in; // width is 80, size is 240
+  in += in; // width is 160, size is 480
+  std::string_view inv = in;
+
+  // last character is incomplete
+  lc.assign_range(inv.substr(0, 479));
+
+  // non-debug format, chars copied as is
+  resv = res = std::format("{:s}", lc);
+  VERIFY( resv == inv.substr(0, 479) );
+
+  // debug-format, incomplete code-point sequence is copied
+  resv = res = std::format("{:?s}", lc);
+  VERIFY( strip_quotes(resv) );
+  VERIFY( resv.substr(0, 477) == inv.substr(0, 477) );
+  resv.remove_prefix(477);
+  VERIFY( resv == R"(\x{e2}\x{85})" );
 }
 
 int main()
@@ -287,4 +374,6 @@ int main()
   test_outputs<char>();
   test_outputs<wchar_t>();
   test_nested();
+  test_padding();
+  test_escaping();
 }
-- 
2.51.0

Reply via email to