https://gcc.gnu.org/g:bdae7824cd9a9d27665bf1b82f60a761a9745a6a
commit r16-1247-gbdae7824cd9a9d27665bf1b82f60a761a9745a6a Author: Jonathan Wakely <jwak...@redhat.com> Date: Wed Jun 4 15:53:20 2025 +0100 libstdc++: Optimize std::counting_semaphore for futex path Rename __semaphore_base to __semaphore_impl, because it's not used as a base class. Replace the three identical lambda expressions with a named class, __semaphore_impl::_Available, which stores the most recent value of the counter as a data member, and provides call operators that test whether the value is decrementable (i.e. whether the semaphore can be acquired). Add a new __platform_semaphore_impl class template to be used when __platform_wait is available, which uses __platform_wait_t for the counter and uses more efficient atomic waits for the acquire functions. For a binary semaphore some members are further optimized because we know the counter can only be zero or one. Also add a bare wait flag to __atomic_wait_address_v, for consistency with __atomic_wait_address_until_v and __atomic_wait_address_for_v and to allow semaphores to use it without the redundant overhead of tracking waiters. libstdc++-v3/ChangeLog: * include/bits/atomic_wait.h (__atomic_wait_address_v): Add bare wait flag. * include/bits/semaphore_base.h (__semaphore_base): Rename to __semaphore_impl. Replace local variable and predicate lambdas with _Available struct. (__platform_semaphore_impl): New class template. (__semaphore_impl): Remove alias template. (_Select_semaphore_impl): New alias template. * include/std/semaphore (counting_semaphore): Use _Select_semaphore_impl. Diff: --- libstdc++-v3/include/bits/atomic_wait.h | 5 +- libstdc++-v3/include/bits/semaphore_base.h | 230 ++++++++++++++++++++++------- libstdc++-v3/include/std/semaphore | 5 +- 3 files changed, 183 insertions(+), 57 deletions(-) diff --git a/libstdc++-v3/include/bits/atomic_wait.h b/libstdc++-v3/include/bits/atomic_wait.h index 815726c16ccb..9ae11191d9ab 100644 --- a/libstdc++-v3/include/bits/atomic_wait.h +++ b/libstdc++-v3/include/bits/atomic_wait.h @@ -249,12 +249,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // C++26 will return __val } + // Wait on __addr while *__addr == __old is true. inline void __atomic_wait_address_v(const __detail::__platform_wait_t* __addr, __detail::__platform_wait_t __old, - int __order) + int __order, bool __bare_wait = false) { - __detail::__wait_args __args{ __addr, __old, __order }; + __detail::__wait_args __args{ __addr, __old, __order, __bare_wait }; // C++26 will not ignore the return value here __detail::__wait_impl(__addr, __args); } diff --git a/libstdc++-v3/include/bits/semaphore_base.h b/libstdc++-v3/include/bits/semaphore_base.h index 3f7a33ccd51a..ebbc9a80b91a 100644 --- a/libstdc++-v3/include/bits/semaphore_base.h +++ b/libstdc++-v3/include/bits/semaphore_base.h @@ -46,23 +46,20 @@ namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<bool _Platform_wait> - struct __semaphore_base + struct __semaphore_impl { - using __count_type = __conditional_t<_Platform_wait, - __detail::__platform_wait_t, - ptrdiff_t>; + using __count_type = ptrdiff_t; static constexpr ptrdiff_t _S_max = __gnu_cxx::__int_traits<__count_type>::__max; constexpr explicit - __semaphore_base(__count_type __count) noexcept + __semaphore_impl(__count_type __count) noexcept : _M_counter(__count) { } - __semaphore_base(const __semaphore_base&) = delete; - __semaphore_base& operator=(const __semaphore_base&) = delete; + __semaphore_impl(const __semaphore_impl&) = delete; + __semaphore_impl& operator=(const __semaphore_impl&) = delete; // Load the current counter value. _GLIBCXX_ALWAYS_INLINE __count_type @@ -71,8 +68,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // Try to acquire the semaphore (i.e. decrement the counter). // Returns false if the current counter is zero, or if another thread - // decrements the value first. In the latter case, __cur is set to the - // new value. + // changes the value first. In the latter case, __cur is set to the new + // value. _GLIBCXX_ALWAYS_INLINE bool _M_do_try_acquire(__count_type& __cur) noexcept { @@ -85,24 +82,19 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION memory_order::relaxed); } + // Keep trying to acquire the semaphore in a loop until it succeeds. void _M_acquire() noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } - return false; - }; - while (!_M_do_try_acquire(__val)) - if (__val == 0) - std::__atomic_wait_address(&_M_counter, __pred, __vfn, true); + auto __vfn = [this]{ return _M_get_current(); }; + _Available __is_available{__vfn()}; + while (!_M_do_try_acquire(__is_available._M_val)) + if (!__is_available()) + std::__atomic_wait_address(&_M_counter, __is_available, __vfn, true); } + // Try to acquire the semaphore, retrying a small number of times + // in case of contention. bool _M_try_acquire() noexcept { @@ -116,22 +108,152 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& __atime) noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } + auto __vfn = [this]{ return _M_get_current(); }; + _Available __is_available{__vfn()}; + while (!_M_do_try_acquire(__is_available._M_val)) + if (!__is_available()) + if (!std::__atomic_wait_address_until(&_M_counter, __is_available, + __vfn, __atime, true)) + return false; // timed out + return true; + } + + template<typename _Rep, typename _Period> + bool + _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) noexcept + { + auto __vfn = [this]{ return _M_get_current(); }; + _Available __is_available{__vfn()}; + while (!_M_do_try_acquire(__is_available._M_val)) + if (!__is_available()) + if (!std::__atomic_wait_address_for(&_M_counter, __is_available, + __vfn, __rtime, true)) + return false; // timed out + return true; + } + + _GLIBCXX_ALWAYS_INLINE ptrdiff_t + _M_release(ptrdiff_t __update) noexcept + { + auto __old = __atomic_impl::fetch_add(&_M_counter, __update, + memory_order::release); + if (__old == 0 && __update > 0) + __atomic_notify_address(&_M_counter, true, true); + return __old; + } + + private: + struct _Available + { + __count_type _M_val; // Cache of the last value loaded from _M_counter. + + // Returns true if the cached value is non-zero and so it should be + // possible to acquire the semaphore. + bool operator()() const noexcept { return _M_val > 0; } + + // Argument should be the latest value of the counter. + // Returns true (and caches the value) if it's non-zero, meaning it + // should be possible to acquire the semaphore. Returns false otherwise. + bool operator()(__count_type __cur) noexcept + { + if (__cur == 0) return false; - }; + _M_val = __cur; + return true; + } + }; + + alignas(__atomic_ref<__count_type>::required_alignment) + __count_type _M_counter; + }; + + // Optimized specialization using __platform_wait (if available) + template<bool _Binary> + struct __platform_semaphore_impl + { + using __count_type = __detail::__platform_wait_t; + + static constexpr ptrdiff_t _S_max + = _Binary ? 1 : __gnu_cxx::__int_traits<__count_type>::__max; + + constexpr explicit + __platform_semaphore_impl(__count_type __count) noexcept + : _M_counter(__count) + { } + + __platform_semaphore_impl(__platform_semaphore_impl&) = delete; + __platform_semaphore_impl& operator=(const __platform_semaphore_impl&) = delete; + + // Load the current counter value. + _GLIBCXX_ALWAYS_INLINE __count_type + _M_get_current() const noexcept + { + if constexpr (_Binary) + return 1; // Not necessarily true, but optimistically assume it is. + else + return __atomic_impl::load(&_M_counter, memory_order::acquire); + } + + // Try to acquire the semaphore (i.e. decrement the counter). + // Returns false if the current counter is zero, or if another thread + // changes the value first. In the latter case, __cur is set to the new + // value. + _GLIBCXX_ALWAYS_INLINE bool + _M_do_try_acquire(__count_type& __cur) noexcept + { + if (__cur == 0) + return false; // Cannot decrement when it's already zero. + + return __atomic_impl::compare_exchange_strong(&_M_counter, + __cur, __cur - 1, + memory_order::acquire, + memory_order::relaxed); + } + + // Keep trying to acquire the semaphore in a loop until it succeeds. + void + _M_acquire() noexcept + { + auto __val = _M_get_current(); + while (!_M_do_try_acquire(__val)) + if (__val == 0) + { + std::__atomic_wait_address_v(&_M_counter, __val, __ATOMIC_ACQUIRE, + true); + __val = _M_get_current(); + } + } + + // Try to acquire the semaphore. + bool + _M_try_acquire() noexcept + { + if constexpr (_Binary) + { + __count_type __val = 1; + // Do not expect much contention on binary semaphore, only try once. + return _M_do_try_acquire(__val); + } + else + // Fastest implementation of this function is just _M_do_try_acquire + // but that can fail under contention even when _M_count > 0. + // Using _M_try_acquire_for(0ns) will retry a few times in a loop. + return _M_try_acquire_for(__detail::__wait_clock_t::duration{}); + } + + template<typename _Clock, typename _Duration> + bool + _M_try_acquire_until(const chrono::time_point<_Clock, _Duration>& __atime) noexcept + { + auto __val = _M_get_current(); while (!_M_do_try_acquire(__val)) if (__val == 0) { - if (!std::__atomic_wait_address_until(&_M_counter, __pred, - __vfn, __atime, true)) + if (!std::__atomic_wait_address_until_v(&_M_counter, 0, + __ATOMIC_ACQUIRE, + __atime, true)) return false; // timed out + __val = _M_get_current(); } return true; } @@ -140,22 +262,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION bool _M_try_acquire_for(const chrono::duration<_Rep, _Period>& __rtime) noexcept { - auto const __vfn = [this]{ return _M_get_current(); }; - auto __val = __vfn(); - auto const __pred = [&__val](__count_type __cur) { - if (__cur > 0) - { - __val = __cur; - return true; - } - return false; - }; + auto __val = _M_get_current(); while (!_M_do_try_acquire(__val)) if (__val == 0) { - if (!std::__atomic_wait_address_for(&_M_counter, __pred, - __vfn, __rtime, true)) + if (!std::__atomic_wait_address_for_v(&_M_counter, 0, + __ATOMIC_ACQUIRE, + __rtime, true)) return false; // timed out + __val = _M_get_current(); } return true; } @@ -170,15 +285,26 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return __old; } - private: - alignas(_Platform_wait ? __detail::__platform_wait_alignment - : __alignof__(__count_type)) - __count_type _M_counter; + protected: + alignas(__detail::__platform_wait_alignment) __count_type _M_counter; }; template<ptrdiff_t _Max> - using __semaphore_impl - = __semaphore_base<(_Max <= __semaphore_base<true>::_S_max)>; + using _Select_semaphore_impl = typename decltype([] + { + using namespace __detail; + if constexpr (__platform_wait_uses_type<__platform_wait_t>) + { + if constexpr (_Max <= 1) + return type_identity<__platform_semaphore_impl<true>>{}; + else if constexpr (_Max <= __platform_semaphore_impl<false>::_S_max) + return type_identity<__platform_semaphore_impl<false>>{}; + else + return type_identity<__semaphore_impl>{}; + } + else + return type_identity<__semaphore_impl>{}; + }())::type; _GLIBCXX_END_NAMESPACE_VERSION } // namespace std diff --git a/libstdc++-v3/include/std/semaphore b/libstdc++-v3/include/std/semaphore index ca1bffe371a0..8f49188563e8 100644 --- a/libstdc++-v3/include/std/semaphore +++ b/libstdc++-v3/include/std/semaphore @@ -45,13 +45,12 @@ namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION - template<ptrdiff_t __least_max_value = __semaphore_base<true>::_S_max> + template<ptrdiff_t __least_max_value = _Select_semaphore_impl<2>::_S_max> class counting_semaphore { static_assert(__least_max_value >= 0); - using _Impl = __semaphore_impl<__least_max_value>; - _Impl _M_sem; + _Select_semaphore_impl<__least_max_value> _M_sem; public: constexpr explicit