https://gcc.gnu.org/g:bdae7824cd9a9d27665bf1b82f60a761a9745a6a

commit r16-1247-gbdae7824cd9a9d27665bf1b82f60a761a9745a6a
Author: Jonathan Wakely <jwak...@redhat.com>
Date:   Wed Jun 4 15:53:20 2025 +0100

    libstdc++: Optimize std::counting_semaphore for futex path
    
    Rename __semaphore_base to __semaphore_impl, because it's not used as a
    base class. Replace the three identical lambda expressions with a named
    class, __semaphore_impl::_Available, which stores the most recent
    value of the counter as a data member, and provides call operators that
    test whether the value is decrementable (i.e. whether the semaphore can
    be acquired).
    
    Add a new __platform_semaphore_impl class template to be used when
    __platform_wait is available, which uses __platform_wait_t for the
    counter and uses more efficient atomic waits for the acquire functions.
    For a binary semaphore some members are further optimized because we
    know the counter can only be zero or one.
    
    Also add a bare wait flag to __atomic_wait_address_v, for consistency
    with __atomic_wait_address_until_v and __atomic_wait_address_for_v and
    to allow semaphores to use it without the redundant overhead of tracking
    waiters.
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare
            wait flag.
            * include/bits/semaphore_base.h (__semaphore_base): Rename to
            __semaphore_impl. Replace local variable and predicate lambdas
            with _Available struct.
            (__platform_semaphore_impl): New class template.
            (__semaphore_impl): Remove alias template.
            (_Select_semaphore_impl): New alias template.
            * include/std/semaphore (counting_semaphore): Use
            _Select_semaphore_impl.

Diff:
---
 libstdc++-v3/include/bits/atomic_wait.h    |   5 +-
 libstdc++-v3/include/bits/semaphore_base.h | 230 ++++++++++++++++++++++-------
 libstdc++-v3/include/std/semaphore         |   5 +-
 3 files changed, 183 insertions(+), 57 deletions(-)

diff --git a/libstdc++-v3/include/bits/atomic_wait.h 
b/libstdc++-v3/include/bits/atomic_wait.h
index 815726c16ccb..9ae11191d9ab 100644
--- a/libstdc++-v3/include/bits/atomic_wait.h
+++ b/libstdc++-v3/include/bits/atomic_wait.h
@@ -249,12 +249,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       // C++26 will return __val
     }
 
+  // Wait on __addr while *__addr == __old is true.
   inline void
   __atomic_wait_address_v(const __detail::__platform_wait_t* __addr,
                          __detail::__platform_wait_t __old,
-                         int __order)
+                         int __order, bool __bare_wait = false)
   {
-    __detail::__wait_args __args{ __addr, __old, __order };
+    __detail::__wait_args __args{ __addr, __old, __order, __bare_wait };
     // C++26 will not ignore the return value here
     __detail::__wait_impl(__addr, __args);
   }
diff --git a/libstdc++-v3/include/bits/semaphore_base.h 
b/libstdc++-v3/include/bits/semaphore_base.h
index 3f7a33ccd51a..ebbc9a80b91a 100644
--- a/libstdc++-v3/include/bits/semaphore_base.h
+++ b/libstdc++-v3/include/bits/semaphore_base.h
@@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<bool _Platform_wait>
-  struct __semaphore_base
+  struct __semaphore_impl
   {
-    using __count_type = __conditional_t<_Platform_wait,
-                                        __detail::__platform_wait_t,
-                                        ptrdiff_t>;
+    using __count_type = ptrdiff_t;
 
     static constexpr ptrdiff_t _S_max
       = __gnu_cxx::__int_traits<__count_type>::__max;
 
     constexpr explicit
-    __semaphore_base(__count_type __count) noexcept
+    __semaphore_impl(__count_type __count) noexcept
     : _M_counter(__count)
     { }
 
-    __semaphore_base(const __semaphore_base&) = delete;
-    __semaphore_base& operator=(const __semaphore_base&) = delete;
+    __semaphore_impl(const __semaphore_impl&) = delete;
+    __semaphore_impl& operator=(const __semaphore_impl&) = delete;
 
     // Load the current counter value.
     _GLIBCXX_ALWAYS_INLINE __count_type
@@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
     // Try to acquire the semaphore (i.e. decrement the counter).
     // Returns false if the current counter is zero, or if another thread
-    // decrements the value first. In the latter case, __cur is set to the
-    // new value.
+    // changes the value first. In the latter case, __cur is set to the new
+    // value.
     _GLIBCXX_ALWAYS_INLINE bool
     _M_do_try_acquire(__count_type& __cur) noexcept
     {
@@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
                                                    memory_order::relaxed);
     }
 
+    // Keep trying to acquire the semaphore in a loop until it succeeds.
     void
     _M_acquire() noexcept
     {
-      auto const __vfn = [this]{ return _M_get_current(); };
-      auto __val = __vfn();
-      auto const __pred = [&__val](__count_type __cur) {
-       if (__cur > 0)
-         {
-           __val = __cur;
-           return true;
-         }
-       return false;
-      };
-      while (!_M_do_try_acquire(__val))
-       if (__val == 0)
-         std::__atomic_wait_address(&_M_counter, __pred, __vfn, true);
+      auto __vfn = [this]{ return _M_get_current(); };
+      _Available __is_available{__vfn()};
+      while (!_M_do_try_acquire(__is_available._M_val))
+       if (!__is_available())
+         std::__atomic_wait_address(&_M_counter, __is_available, __vfn, true);
     }
 
+    // Try to acquire the semaphore, retrying a small number of times
+    // in case of contention.
     bool
     _M_try_acquire() noexcept
     {
@@ -116,22 +108,152 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       bool
       _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& 
__atime) noexcept
       {
-       auto const __vfn = [this]{ return _M_get_current(); };
-       auto __val = __vfn();
-       auto const __pred = [&__val](__count_type __cur) {
-         if (__cur > 0)
-           {
-             __val = __cur;
-             return true;
-           }
+       auto __vfn = [this]{ return _M_get_current(); };
+       _Available __is_available{__vfn()};
+       while (!_M_do_try_acquire(__is_available._M_val))
+         if (!__is_available())
+           if (!std::__atomic_wait_address_until(&_M_counter, __is_available,
+                                                 __vfn, __atime, true))
+             return false; // timed out
+       return true;
+      }
+
+    template<typename _Rep, typename _Period>
+      bool
+      _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) 
noexcept
+      {
+       auto __vfn = [this]{ return _M_get_current(); };
+       _Available __is_available{__vfn()};
+       while (!_M_do_try_acquire(__is_available._M_val))
+         if (!__is_available())
+           if (!std::__atomic_wait_address_for(&_M_counter, __is_available,
+                                               __vfn, __rtime, true))
+             return false; // timed out
+       return true;
+      }
+
+    _GLIBCXX_ALWAYS_INLINE ptrdiff_t
+    _M_release(ptrdiff_t __update) noexcept
+    {
+      auto __old = __atomic_impl::fetch_add(&_M_counter, __update,
+                                           memory_order::release);
+      if (__old == 0 && __update > 0)
+       __atomic_notify_address(&_M_counter, true, true);
+      return __old;
+    }
+
+  private:
+    struct _Available
+    {
+      __count_type _M_val; // Cache of the last value loaded from _M_counter.
+
+      // Returns true if the cached value is non-zero and so it should be
+      // possible to acquire the semaphore.
+      bool operator()() const noexcept { return _M_val > 0; }
+
+      // Argument should be the latest value of the counter.
+      // Returns true (and caches the value) if it's non-zero, meaning it
+      // should be possible to acquire the semaphore. Returns false otherwise.
+      bool operator()(__count_type __cur) noexcept
+      {
+       if (__cur == 0)
          return false;
-       };
+       _M_val = __cur;
+       return true;
+      }
+    };
+
+    alignas(__atomic_ref<__count_type>::required_alignment)
+      __count_type _M_counter;
+  };
+
+  // Optimized specialization using __platform_wait (if available)
+  template<bool _Binary>
+  struct __platform_semaphore_impl
+  {
+    using __count_type = __detail::__platform_wait_t;
+
+    static constexpr ptrdiff_t _S_max
+      = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max;
+
+    constexpr explicit
+    __platform_semaphore_impl(__count_type __count) noexcept
+    : _M_counter(__count)
+    { }
+
+    __platform_semaphore_impl(__platform_semaphore_impl&) = delete;
+    __platform_semaphore_impl& operator=(const __platform_semaphore_impl&) = 
delete;
+
+    // Load the current counter value.
+    _GLIBCXX_ALWAYS_INLINE __count_type
+    _M_get_current() const noexcept
+    {
+      if constexpr (_Binary)
+       return 1; // Not necessarily true, but optimistically assume it is.
+      else
+       return __atomic_impl::load(&_M_counter, memory_order::acquire);
+    }
+
+    // Try to acquire the semaphore (i.e. decrement the counter).
+    // Returns false if the current counter is zero, or if another thread
+    // changes the value first. In the latter case, __cur is set to the new
+    // value.
+    _GLIBCXX_ALWAYS_INLINE bool
+    _M_do_try_acquire(__count_type& __cur) noexcept
+    {
+      if (__cur == 0)
+       return false; // Cannot decrement when it's already zero.
+
+      return __atomic_impl::compare_exchange_strong(&_M_counter,
+                                                   __cur, __cur - 1,
+                                                   memory_order::acquire,
+                                                   memory_order::relaxed);
+    }
+
+    // Keep trying to acquire the semaphore in a loop until it succeeds.
+    void
+    _M_acquire() noexcept
+    {
+      auto __val = _M_get_current();
+      while (!_M_do_try_acquire(__val))
+       if (__val == 0)
+         {
+           std::__atomic_wait_address_v(&_M_counter, __val, __ATOMIC_ACQUIRE,
+                                        true);
+           __val = _M_get_current();
+         }
+    }
+
+    // Try to acquire the semaphore.
+    bool
+    _M_try_acquire() noexcept
+    {
+      if constexpr (_Binary)
+       {
+         __count_type __val = 1;
+         // Do not expect much contention on binary semaphore, only try once.
+         return _M_do_try_acquire(__val);
+       }
+      else
+       // Fastest implementation of this function is just _M_do_try_acquire
+       // but that can fail under contention even when _M_count > 0.
+       // Using _M_try_acquire_for(0ns) will retry a few times in a loop.
+       return _M_try_acquire_for(__detail::__wait_clock_t::duration{});
+    }
+
+    template<typename _Clock, typename _Duration>
+      bool
+      _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& 
__atime) noexcept
+      {
+       auto __val = _M_get_current();
        while (!_M_do_try_acquire(__val))
          if (__val == 0)
            {
-             if (!std::__atomic_wait_address_until(&_M_counter, __pred,
-                                                   __vfn, __atime, true))
+             if (!std::__atomic_wait_address_until_v(&_M_counter, 0,
+                                                     __ATOMIC_ACQUIRE,
+                                                     __atime, true))
                return false; // timed out
+             __val = _M_get_current();
            }
        return true;
       }
@@ -140,22 +262,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       bool
       _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) 
noexcept
       {
-       auto const __vfn = [this]{ return _M_get_current(); };
-       auto __val = __vfn();
-       auto const __pred = [&__val](__count_type __cur) {
-         if (__cur > 0)
-           {
-             __val = __cur;
-             return true;
-           }
-         return false;
-       };
+       auto __val = _M_get_current();
        while (!_M_do_try_acquire(__val))
          if (__val == 0)
            {
-             if (!std::__atomic_wait_address_for(&_M_counter, __pred,
-                                                 __vfn, __rtime, true))
+             if (!std::__atomic_wait_address_for_v(&_M_counter, 0,
+                                                   __ATOMIC_ACQUIRE,
+                                                   __rtime, true))
                return false; // timed out
+             __val = _M_get_current();
            }
        return true;
       }
@@ -170,15 +285,26 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       return __old;
     }
 
-  private:
-    alignas(_Platform_wait ? __detail::__platform_wait_alignment
-                          : __alignof__(__count_type))
-    __count_type _M_counter;
+  protected:
+    alignas(__detail::__platform_wait_alignment) __count_type _M_counter;
   };
 
   template<ptrdiff_t _Max>
-    using __semaphore_impl
-      = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>;
+    using _Select_semaphore_impl = typename decltype([]
+    {
+      using namespace __detail;
+      if constexpr (__platform_wait_uses_type<__platform_wait_t>)
+       {
+         if constexpr (_Max <= 1)
+           return type_identity<__platform_semaphore_impl<true>>{};
+         else if constexpr (_Max <= __platform_semaphore_impl<false>::_S_max)
+           return type_identity<__platform_semaphore_impl<false>>{};
+         else
+           return type_identity<__semaphore_impl>{};
+       }
+      else
+       return type_identity<__semaphore_impl>{};
+    }())::type;
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
diff --git a/libstdc++-v3/include/std/semaphore 
b/libstdc++-v3/include/std/semaphore
index ca1bffe371a0..8f49188563e8 100644
--- a/libstdc++-v3/include/std/semaphore
+++ b/libstdc++-v3/include/std/semaphore
@@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max>
+  template<ptrdiff_t __least_max_value = _Select_semaphore_impl<2>::_S_max>
     class counting_semaphore
     {
       static_assert(__least_max_value >= 0);
 
-      using _Impl = __semaphore_impl<__least_max_value>;
-      _Impl _M_sem;
+      _Select_semaphore_impl<__least_max_value> _M_sem;
 
     public:
       constexpr explicit

Reply via email to