https://gcc.gnu.org/g:5ced917508eee7eb499e19feeb3def1fa1842bb4

commit r15-7528-g5ced917508eee7eb499e19feeb3def1fa1842bb4
Author: Matthew Malcomson <mmalcom...@nvidia.com>
Date:   Fri Feb 7 14:49:11 2025 +0000

    libstdc++: Conditionally use floating-point fetch_add builtins
    
    - Some hardware has support for floating point atomic fetch_add (and
      similar).
    - There are existing compilers targeting this hardware that use
      libstdc++ -- e.g. NVC++.
    - Since libstdc++'s atomic<float>::fetch_add and similar are written
      directly as a CAS loop, these compilers cannot emit optimal code
      when they see such constructs.
    - I hope to use __atomic_fetch_add builtins on floating point types
      directly in libstdc++ so these compilers can emit better code.
    - Clang already handles some floating point types in the
      __atomic_fetch_add family of builtins.
    - In order to only use this when available, I originally thought I could
      check against the resolved versions of the builtin with something
      like `__has_builtin(__atomic_fetch_add_<fp-suffix>)`.
      I then realised that clang does not expose resolved versions of these
      atomic builtins to the user.
      On the clang Discourse it was suggested that we instead use SFINAE
      (which clang already supports); a minimal sketch of this detection
      pattern is given after this list.
    - I have recently pushed a patch allowing the use of SFINAE on
      builtins: r15-6042-g9ed094a817ecaf
      Now that that patch is committed, this patch does not change the
      behaviour for GCC, while clang uses the builtin for codegen.
    - I have previously sent a patchset upstream adding the ability to use
      __atomic_fetch_add and similar on floating point types:
      https://gcc.gnu.org/pipermail/gcc-patches/2024-November/668754.html
      Once that patchset is upstream (plus the automatic linking of
      libatomic that Joseph pointed out in the email below:
      https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665408.html )
      GCC itself should start to use the builtin branch added in this
      patch.
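    
    As a minimal sketch of the detection pattern used here (outside of
    libstdc++; `atomic_fetch_addable' and `fetch_add_flt_demo' are
    hypothetical names for illustration, not part of this patch), a
    requires-expression admits only the types for which the compiler
    resolves __atomic_fetch_add, and the CAS loop remains the fallback:
    
        // Sketch only; needs C++20 for the concept syntax.
        template<typename T>
          concept atomic_fetch_addable
            = requires (T t) { __atomic_fetch_add(&t, t, 0); };
    
        template<typename T>
          T
          fetch_add_flt_demo(T* ptr, T val)
          {
            if constexpr (atomic_fetch_addable<T>)
              // Taken by compilers that accept floating-point
              // __atomic_fetch_add (e.g. clang, NVC++).
              return __atomic_fetch_add(ptr, val, __ATOMIC_SEQ_CST);
            else
              {
                // Fallback: emulate fetch_add with a compare-exchange
                // loop via the generic __atomic builtins, which accept
                // any trivially-copyable type.
                T oldval;
                __atomic_load(ptr, &oldval, __ATOMIC_RELAXED);
                T newval = oldval + val;
                while (!__atomic_compare_exchange(ptr, &oldval, &newval,
                                                  /*weak=*/true,
                                                  __ATOMIC_SEQ_CST,
                                                  __ATOMIC_RELAXED))
                  newval = oldval + val;
                return oldval;
              }
          }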
    
    So *currently*, this patch allows external compilers (NVC++ in
    particular) to generate better code, and similarly lets clang understand
    the operation better since it maps to a known builtin.
    
    I hope that by GCC 16 this patch will also allow GCC itself to understand
    the operation better by mapping it to a known builtin.
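    
    For reference, the user-facing operation that benefits is just the
    following (a usage sketch; `add_one' is an illustrative wrapper, and
    whether it lowers to a single hardware atomic RMW depends on the
    compiler and target):
    
        #include <atomic>
    
        float
        add_one(std::atomic<float>& a)
        {
          // On compilers whose __atomic_fetch_add accepts float (clang,
          // NVC++), fetch_add now maps to that builtin; otherwise the
          // existing compare-exchange-loop fallback is kept.
          return a.fetch_add(1.0f, std::memory_order_seq_cst);
        }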
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/atomic_base.h (__atomic_fetch_addable): Define
            new concept.
            (__atomic_impl::__fetch_add_flt): Use new concept to make use of
            __atomic_fetch_add when available.
            (__atomic_fetch_subtractable, __fetch_sub_flt): Likewise.
            (__atomic_add_fetchable, __add_fetch_flt): Likewise.
            (__atomic_sub_fetchable, __sub_fetch_flt): Likewise.
    
    Signed-off-by: Matthew Malcomson <mmalcom...@nvidia.com>
    Co-authored-by: Jonathan Wakely <jwak...@redhat.com>

Diff:
---
 libstdc++-v3/include/bits/atomic_base.h | 88 +++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 26 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_base.h b/libstdc++-v3/include/bits/atomic_base.h
index 1ef21f30bbce..b56007b7bf5f 100644
--- a/libstdc++-v3/include/bits/atomic_base.h
+++ b/libstdc++-v3/include/bits/atomic_base.h
@@ -1209,54 +1209,90 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       __xor_fetch(_Tp* __ptr, _Val<_Tp> __i) noexcept
       { return __atomic_xor_fetch(__ptr, __i, __ATOMIC_SEQ_CST); }
 
+    template<typename _Tp>
+      concept __atomic_fetch_addable
+       = requires (_Tp __t) { __atomic_fetch_add(&__t, __t, 0); };
+
     template<typename _Tp>
       _Tp
       __fetch_add_flt(_Tp* __ptr, _Val<_Tp> __i, memory_order __m) noexcept
       {
-       _Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
-       _Val<_Tp> __newval = __oldval + __i;
-       while (!compare_exchange_weak(__ptr, __oldval, __newval, __m,
-                                     memory_order_relaxed))
-         __newval = __oldval + __i;
-       return __oldval;
+       if constexpr (__atomic_fetch_addable<_Tp>)
+         return __atomic_fetch_add(__ptr, __i, int(__m));
+       else
+         {
+           _Val<_Tp> __oldval = load (__ptr, memory_order_relaxed);
+           _Val<_Tp> __newval = __oldval + __i;
+           while (!compare_exchange_weak (__ptr, __oldval, __newval, __m,
+                                          memory_order_relaxed))
+             __newval = __oldval + __i;
+           return __oldval;
+         }
       }
 
+    template<typename _Tp>
+      concept __atomic_fetch_subtractable
+       = requires (_Tp __t) { __atomic_fetch_sub(&__t, __t, 0); };
+
     template<typename _Tp>
       _Tp
       __fetch_sub_flt(_Tp* __ptr, _Val<_Tp> __i, memory_order __m) noexcept
       {
-       _Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
-       _Val<_Tp> __newval = __oldval - __i;
-       while (!compare_exchange_weak(__ptr, __oldval, __newval, __m,
-                                     memory_order_relaxed))
-         __newval = __oldval - __i;
-       return __oldval;
+       if constexpr (__atomic_fetch_subtractable<_Tp>)
+         return __atomic_fetch_sub(__ptr, __i, int(__m));
+       else
+         {
+           _Val<_Tp> __oldval = load (__ptr, memory_order_relaxed);
+           _Val<_Tp> __newval = __oldval - __i;
+           while (!compare_exchange_weak (__ptr, __oldval, __newval, __m,
+                                          memory_order_relaxed))
+             __newval = __oldval - __i;
+           return __oldval;
+         }
       }
 
+    template<typename _Tp>
+      concept __atomic_add_fetchable
+       = requires (_Tp __t) { __atomic_add_fetch(&__t, __t, 0); };
+
     template<typename _Tp>
       _Tp
       __add_fetch_flt(_Tp* __ptr, _Val<_Tp> __i) noexcept
       {
-       _Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
-       _Val<_Tp> __newval = __oldval + __i;
-       while (!compare_exchange_weak(__ptr, __oldval, __newval,
-                                     memory_order_seq_cst,
-                                     memory_order_relaxed))
-         __newval = __oldval + __i;
-       return __newval;
+       if constexpr (__atomic_add_fetchable<_Tp>)
+         return __atomic_add_fetch(__ptr, __i, __ATOMIC_SEQ_CST);
+       else
+         {
+           _Val<_Tp> __oldval = load (__ptr, memory_order_relaxed);
+           _Val<_Tp> __newval = __oldval + __i;
+           while (!compare_exchange_weak (__ptr, __oldval, __newval,
+                                          memory_order_seq_cst,
+                                          memory_order_relaxed))
+             __newval = __oldval + __i;
+           return __newval;
+         }
       }
 
+    template<typename _Tp>
+      concept __atomic_sub_fetchable
+       = requires (_Tp __t) { __atomic_sub_fetch(&__t, __t, 0); };
+
     template<typename _Tp>
       _Tp
       __sub_fetch_flt(_Tp* __ptr, _Val<_Tp> __i) noexcept
       {
-       _Val<_Tp> __oldval = load(__ptr, memory_order_relaxed);
-       _Val<_Tp> __newval = __oldval - __i;
-       while (!compare_exchange_weak(__ptr, __oldval, __newval,
-                                     memory_order_seq_cst,
-                                     memory_order_relaxed))
-         __newval = __oldval - __i;
-       return __newval;
+       if constexpr (__atomic_sub_fetchable<_Tp>)
+         return __atomic_sub_fetch(__ptr, __i, __ATOMIC_SEQ_CST);
+       else
+         {
+           _Val<_Tp> __oldval = load (__ptr, memory_order_relaxed);
+           _Val<_Tp> __newval = __oldval - __i;
+           while (!compare_exchange_weak (__ptr, __oldval, __newval,
+                                          memory_order_seq_cst,
+                                          memory_order_relaxed))
+             __newval = __oldval - __i;
+           return __newval;
+         }
       }
   } // namespace __atomic_impl
