Am Do., 11. März 2021 um 18:17 Uhr schrieb Patrick Palka via Libstdc++
<libstd...@gcc.gnu.org>:
>
> This implements a minimal integer class type that emulates 128-bit
> unsigned arithmetic using a pair of 64-bit integers, which the
> floating-point std::to_chars implementation then uses as a drop-in
> replacement for unsigned __int128 on targets that lack the latter.
> This allows us to fully support formatting of large long double types
> on targets that lack __int128.
>
> Since Ryu performs 128-bit division/modulus only by 2, 5 and 10, the
> integer class type supports only these divisors rather than supporting
> general division/modulus.
>
> Tested on x86, x86_64, ppc64le, ppc64be and aarch64, with and without
> performing the equivalent of -U__SIZEOF_INT128__ in floating_to_chars.cc
> (so that we also test using the class type on targets where __int128 is
> available).
>
> libstdc++-v3/ChangeLog:
>
>         * src/c++17/floating_to_chars.cc: Simplify the file as if
>         __SIZEOF_INT128__ is always defined.
>         [!defined __SIZEOF_INT128__]: Include "uint128_t.h".  Define
>         a to_chars overload for the uint128_t class type.
>         * src/c++17/uint128_t.h: New file.
>         * testsuite/20_util/to_chars/long_double.cc: No longer expect an
>         execution FAIL on targets that have a large long double type
>         but lack __int128.
> ---
>  libstdc++-v3/src/c++17/floating_to_chars.cc   |  58 ++--
>  libstdc++-v3/src/c++17/uint128_t.h            | 297 ++++++++++++++++++
>  .../testsuite/20_util/to_chars/long_double.cc |   1 -
>  3 files changed, 332 insertions(+), 24 deletions(-)
>  create mode 100644 libstdc++-v3/src/c++17/uint128_t.h
>
> diff --git a/libstdc++-v3/src/c++17/floating_to_chars.cc b/libstdc++-v3/src/c++17/floating_to_chars.cc
> index da3fbaa1ed1..86f4401e134 100644
> --- a/libstdc++-v3/src/c++17/floating_to_chars.cc
> +++ b/libstdc++-v3/src/c++17/floating_to_chars.cc
> @@ -64,25 +64,19 @@ extern "C" int __sprintfieee128(char*, const char*, ...);
>
>  #if __LDBL_MANT_DIG__ == __DBL_MANT_DIG__
>  # define LONG_DOUBLE_KIND LDK_BINARY64
> -#elif defined(__SIZEOF_INT128__)
> -// The Ryu routines need a 128-bit integer type in order to do shortest
> -// formatting of types larger than 64-bit double, so without __int128 we can't
> -// support any large long double format.  This is the case for e.g. i386.
> -# if __LDBL_MANT_DIG__ == 64
> +#elif __LDBL_MANT_DIG__ == 64
>  #  define LONG_DOUBLE_KIND LDK_FLOAT80
> -# elif __LDBL_MANT_DIG__ == 113
> -#  define LONG_DOUBLE_KIND LDK_BINARY128
> -# elif __LDBL_MANT_DIG__ == 106
> -#  define LONG_DOUBLE_KIND LDK_IBM128
> -# endif
> -# if defined _GLIBCXX_USE_FLOAT128 && __FLT128_MANT_DIG__ == 113
> -// Define overloads of std::to_chars for __float128.
> -#  define FLOAT128_TO_CHARS 1
> -# endif
> +#elif __LDBL_MANT_DIG__ == 113
> +# define LONG_DOUBLE_KIND LDK_BINARY128
> +#elif __LDBL_MANT_DIG__ == 106
> +# define LONG_DOUBLE_KIND LDK_IBM128
> +#else
> +# define LONG_DOUBLE_KIND LDK_UNSUPPORTED
>  #endif
>
> -#if !defined(LONG_DOUBLE_KIND)
> -# define LONG_DOUBLE_KIND LDK_UNSUPPORTED
> +#if defined _GLIBCXX_USE_FLOAT128 && __FLT128_MANT_DIG__ == 113
> +// Define overloads of std::to_chars for __float128.
> +# define FLOAT128_TO_CHARS 1
>  #endif
>
>  // For now we only support __float128 when it's the powerpc64 __ieee128 type.
> @@ -100,6 +94,8 @@ namespace
>  {
>  #if defined __SIZEOF_INT128__
>    using uint128_t = unsigned __int128;
> +#else
> +# include "uint128_t.h"
>  #endif
>
>    namespace ryu
> @@ -114,7 +110,6 @@ namespace
>  #include "ryu/d2fixed.c"
>  #include "ryu/f2s.c"
>
> -#ifdef __SIZEOF_INT128__
>      namespace generic128
>      {
>        // Put the generic Ryu bits in their own namespace to avoid name conflicts.
> @@ -129,7 +124,6 @@ namespace
>      int
>      to_chars(const floating_decimal_128 v, char* const result)
>      { return generic128::generic_to_chars(v, result); }
> -#endif
>    } // namespace ryu
>
>    // A traits class that contains pertinent information about the binary
> @@ -407,10 +401,8 @@ namespace
>           return uint32_t{};
>         else if constexpr (total_bits <= 64)
>           return uint64_t{};
> -#ifdef __SIZEOF_INT128__
>         else if constexpr (total_bits <= 128)
>           return uint128_t{};
> -#endif
>        };
>        using uint_t = decltype(get_uint_t());
>        uint_t value_bits = 0;
> @@ -503,7 +495,6 @@ namespace
>         return ryu::floating_to_fd32(value);
>        else if constexpr (std::is_same_v<T, double>)
>         return ryu::floating_to_fd64(value);
> -#ifdef __SIZEOF_INT128__
>        else if constexpr (std::is_same_v<T, long double>
>                          || std::is_same_v<T, F128_type>)
>         {
> @@ -519,7 +510,6 @@ namespace
>                                                 mantissa_bits, exponent_bits,
>                                                 !has_implicit_leading_bit);
>         }
> -#endif
>      }
>
>    // This subroutine returns true if the shortest scientific form fd is a
> @@ -558,10 +548,32 @@ namespace
>    get_mantissa_length(const ryu::floating_decimal_64 fd)
>    { return ryu::decimalLength17(fd.mantissa); }
>
> -#ifdef __SIZEOF_INT128__
>    int
>    get_mantissa_length(const ryu::floating_decimal_128 fd)
>    { return ryu::generic128::decimalLength(fd.mantissa); }
> +
> +#if !defined __SIZEOF_INT128__
> +  // An implementation of base-10 std::to_chars for uint128_t on targets that
> +  // lack __int128.
> +  std::to_chars_result
> +  to_chars(char* first, char* last, uint128_t x)
> +  {
> +    const int len = ryu::generic128::decimalLength(x);
> +    if (last - first < len)
> +      return {last, std::errc::value_too_large};
> +    if (x == 0)
> +      {
> +       *first++ = '0';
> +       return {first, std::errc{}};
> +      }
> +    for (int i = 0; i < len; ++i)
> +      {
> +       first[len - 1 - i] = '0' + static_cast<char>(x % 10);
> +       x /= 10;
> +      }
> +    __glibcxx_assert(x == 0);
> +    return {first + len, std::errc{}};
> +  }
>  #endif
>  } // anon namespace
>
> diff --git a/libstdc++-v3/src/c++17/uint128_t.h b/libstdc++-v3/src/c++17/uint128_t.h
> new file mode 100644
> index 00000000000..90ebae2ffd2
> --- /dev/null
> +++ b/libstdc++-v3/src/c++17/uint128_t.h
> @@ -0,0 +1,297 @@
> +// A relatively minimal unsigned 128-bit integer class type, used by the
> +// floating-point std::to_chars implementation on targets that lack __int128.
> +
> +// Copyright (C) 2021 Free Software Foundation, Inc.
> +//
> +// This file is part of the GNU ISO C++ Library.  This library is free
> +// software; you can redistribute it and/or modify it under the
> +// terms of the GNU General Public License as published by the
> +// Free Software Foundation; either version 3, or (at your option)
> +// any later version.
> +
> +// This library is distributed in the hope that it will be useful,
> +// but WITHOUT ANY WARRANTY; without even the implied warranty of
> +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +// GNU General Public License for more details.
> +
> +// Under Section 7 of GPL version 3, you are granted additional
> +// permissions described in the GCC Runtime Library Exception, version
> +// 3.1, as published by the Free Software Foundation.
> +
> +// You should have received a copy of the GNU General Public License and
> +// a copy of the GCC Runtime Library Exception along with this program;
> +// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +// <http://www.gnu.org/licenses/>.
> +
> +struct uint128_t
> +{
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +  uint64_t lo, hi;
> +#else
> +  uint64_t hi, lo;
> +#endif
> +
> +  uint128_t() = default;
> +
> +  constexpr
> +  uint128_t(uint64_t lo, uint64_t hi = 0)
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +    : lo(lo), hi(hi)
> +#else
> +    : hi(hi), lo(lo)
> +#endif
> +  { }
> +
> +  constexpr explicit
> +  operator bool() const
> +  { return *this != 0; }
> +
> +  template<typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
> +    constexpr explicit
> +    operator T() const
> +    {
> +      static_assert(sizeof(T) <= sizeof(uint64_t));
> +      return static_cast<T>(lo);
> +    }
> +
> +  friend constexpr uint128_t
> +  operator&(uint128_t x, const uint128_t y)
> +  {
> +    x.lo &= y.lo;
> +    x.hi &= y.hi;
> +    return x;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator|(uint128_t x, const uint128_t y)
> +  {
> +    x.lo |= y.lo;
> +    x.hi |= y.hi;
> +    return x;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator<<(uint128_t x, const uint128_t y)
> +  {
> +    __glibcxx_assert(y < 128);
> +    // TODO: Convince GCC to use shldq on x86 here.
> +    if (y.lo >= 64)
> +      {
> +       x.hi = x.lo << (y.lo - 64);
> +       x.lo = 0;
> +      }
> +    else if (y.lo != 0)
> +      {
> +       x.hi <<= y.lo;
> +       x.hi |= x.lo >> (64 - y.lo);
> +       x.lo <<= y.lo;
> +      }
> +    return x;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator>>(uint128_t x, const uint128_t y)
> +  {
> +    __glibcxx_assert(y < 128);
> +    // TODO: Convince GCC to use shrdq on x86 here.
> +    if (y.lo >= 64)
> +      {
> +       x.lo = x.hi >> (y.lo - 64);
> +       x.hi = 0;
> +      }
> +    else if (y.lo != 0)
> +      {
> +       x.lo >>= y.lo;
> +       x.lo |= x.hi << (64 - y.lo);
> +       x.hi >>= y.lo;
> +      }
> +    return x;
> +  }
> +
> +  constexpr uint128_t
> +  operator~() const
> +  { return {~lo, ~hi}; }
> +
> +  constexpr uint128_t
> +  operator-() const
> +  { return operator~() + 1; }
> +
> +  friend constexpr uint128_t
> +  operator+(uint128_t x, const uint128_t y)
> +  {
> +    x.hi += __builtin_add_overflow(x.lo, y.lo, &x.lo);
> +    x.hi += y.hi;
> +    return x;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator-(uint128_t x, const uint128_t y)
> +  {
> +    x.hi -= __builtin_sub_overflow(x.lo, y.lo, &x.lo);
> +    x.hi -= y.hi;
> +    return x;
> +  }
> +
> +  static constexpr uint128_t
> +  umul64_64_128(const uint64_t x, const uint64_t y)
> +  {
> +    const uint64_t xl = x & 0xffffffff;
> +    const uint64_t xh = x >> 32;
> +    const uint64_t yl = y & 0xffffffff;
> +    const uint64_t yh = y >> 32;
> +    const uint64_t ll = xl * yl;
> +    const uint64_t lh = xl * yh;
> +    const uint64_t hl = xh * yl;
> +    const uint64_t hh = xh * yh;
> +    const uint64_t m = (ll >> 32) + lh + (hl & 0xffffffff);
> +    const uint64_t l = (ll & 0xffffffff ) | (m << 32);
> +    const uint64_t h = (m >> 32) + (hl >> 32) + hh;
> +    return {l, h};
> +  }
> +
> +  friend constexpr uint128_t
> +  operator*(uint128_t x, const uint128_t y)
> +  {
> +    uint128_t z = umul64_64_128(x.lo, y.lo);
> +    z.hi += x.hi*y.lo + y.hi*x.lo;
> +    return z;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator/(const uint128_t x, const uint128_t y)
> +  {
> +    // Ryu performs 128-bit division only by 5 and 10, so that's what we
> +    // implement.  The strategy here is to relate division of x with that of
> +    // x.hi and x.lo separately.
> +    __glibcxx_assert(y == 5 || y == 10);
> +    // The following implements division by 5 and 10.  In either case, we
> +    // first compute division by 5:
> +    //   x/5 = (x.hi*2^64 + x.lo)/5
> +    //       = (x.hi*(2^64-1) + x.hi + x.lo)/5
> +    //       = x.hi*((2^64-1)/5) + (x.hi + x.lo)/5 since CST=(2^64-1)/5 is exact
> +    //       = x.hi*CST + x.hi/5 + x.lo/5 + ((x.lo%5) + (x.hi%5) >= 5)
> +    // We go a step further and replace the last adjustment term with a
> +    // lookup table, which we encode as a binary literal.  This seems to
> +    // yield smaller code on x86 at least.
> +    constexpr auto cst = ~uint64_t(0) / 5;
> +    uint128_t q = uint128_t{x.hi}*cst + uint128_t{x.hi/5 + x.lo/5};
> +    constexpr auto lookup = 0b111100000u;
> +    q += (lookup >> ((x.hi % 5) + (x.lo % 5))) & 1;
> +    if (y == 10)
> +      q >>= 1;
> +    return q;
> +  }
> +
> +  friend constexpr uint128_t
> +  operator%(uint128_t x, const uint128_t y)
> +  {
> +    // Ryu performs 128-bit modulus only by 2, 5 and 10, so that's what we
> +    // implement.  The strategy here is to relate modulus of x with that of
> +    // x.hi and x.lo separately.
> +    if (y == 2)
> +      return x & 1;
> +    __glibcxx_assert(y == 5 || y == 10);
> +    // The following implements modulus by 5 and 10.  In either case,
> +    // we first compute modulus by 5:
> +    //   x (mod 5) = x.hi*2^64 + x.lo (mod 5)
> +    //             = x.hi + x.lo (mod 5) since 2^64 ≡ 1 (mod 5)
> +    // So the straightforward implementation would be
> +    //   ((x.hi % 5) + (x.lo % 5)) % 5
> +    // But we go a step further and replace the outermost % with a
> +    // lookup table:
> +    //           = {0,1,2,3,4,0,1,2,3}[(x.hi % 5) + (x.lo % 5)] (mod 5)
> +    // which we encode as an octal literal.
> +    constexpr auto lookup = 0321043210u;
> +    auto r = (lookup >> 3*((x.hi % 5) + (x.lo % 5))) & 7;
> +    if (y == 10)
> +      // x % 10 = (x % 5)      if x / 5 is even
> +      //          (x % 5) + 5  if x / 5 is odd
> +      // The compiler should be able to CSE the below computation of x/5 and
> +      // the above modulus operations with a nearby inlined computation of x/10.
> +      r += ((x / 5).lo & 1) * 5;
> +    return r;
> +  }
> +
> +  friend constexpr bool
> +  operator==(const uint128_t x, const uint128_t y)
> +  { return x.hi == y.hi && x.lo == y.lo; }
> +
> +  friend constexpr bool
> +  operator<(const uint128_t x, const uint128_t y)
> +  { return x.hi < y.hi || (x.hi == y.hi && x.lo < y.lo); }
> +
> +  friend constexpr auto
> +  __bit_width(const uint128_t x)
> +  {
> +    if (auto w = std::__bit_width(x.hi))
> +      return w + 64;
> +    else
> +      return std::__bit_width(x.lo);
> +  }
> +
> +  friend constexpr auto
> +  __countr_zero(const uint128_t x)
> +  {
> +    auto c = std::__countr_zero(x.lo);
> +    if (c == 64)
> +      return 64 + std::__countr_zero(x.hi);
> +    else
> +      return c;
> +  }
> +
> +  constexpr uint128_t&
> +  operator--()
> +  { return *this -= 1; }
> +
> +  constexpr uint128_t&
> +  operator++()
> +  { return *this += 1; }
> +
> +  constexpr uint128_t&
> +  operator+=(const uint128_t y)
> +  { return *this = *this + y; }
> +
> +  constexpr uint128_t&
> +  operator-=(const uint128_t y)
> +  { return *this = *this - y; }
> +
> +  constexpr uint128_t&
> +  operator*=(const uint128_t y)
> +  { return *this = *this * y; }
> +
> +  constexpr uint128_t&
> +  operator<<=(const uint128_t y)
> +  { return *this = *this << y; }
> +
> +  constexpr uint128_t&
> +  operator>>=(const uint128_t y)
> +  { return *this = *this >> y; }
> +
> +  constexpr uint128_t&
> +  operator|=(const uint128_t y)
> +  { return *this = *this | y; }
> +
> +  constexpr uint128_t&
> +  operator&=(const uint128_t y)
> +  { return *this = *this & y; }
> +
> +  constexpr uint128_t&
> +  operator%=(const uint128_t y)
> +  { return *this = *this % y; }
> +
> +  constexpr uint128_t&
> +  operator/=(const uint128_t y)
> +  { return *this = *this / y; }
> +
> +  friend constexpr bool
> +  operator!=(const uint128_t x, const uint128_t y)
> +  { return !(x == y); }
> +
> +  friend constexpr bool
> +  operator>(const uint128_t x, const uint128_t y)
> +  { return y < x; }
> +
> +  friend constexpr bool
> +  operator>=(const uint128_t x, const uint128_t y)
> +  { return !(x < y); }
> +};
> diff --git a/libstdc++-v3/testsuite/20_util/to_chars/long_double.cc b/libstdc++-v3/testsuite/20_util/to_chars/long_double.cc
> index da847ae5401..5c1f7136f21 100644
> --- a/libstdc++-v3/testsuite/20_util/to_chars/long_double.cc
> +++ b/libstdc++-v3/testsuite/20_util/to_chars/long_double.cc
> @@ -18,7 +18,6 @@
>  // <charconv> is supported in C++14 as a GNU extension, but this test uses C++17
>  // hexadecimal floating-point literals.
>  // { dg-do run { target c++17 } }
> -// { dg-xfail-run-if "Ryu needs __int128" { large_long_double && { ! int128 } } }
>  // { dg-require-effective-target ieee-floats }

It seems to me that basically all uint128_t operations should be
unconditionally noexcept. Currently none of them is declared noexcept
(constexpr does not imply noexcept, so it is a red herring here).

Would it be worth adding (conditional) support for operator<=>, along
with the corresponding simplification of the comparison operators?

Thanks,

- Daniel

Reply via email to