On Fri, Jan 31, 2025 at 12:01 PM Abhishek Kaushik
<abhishek.kaus...@intel.com> wrote:
>
> From 4ac7c7e56e23ed2f4dd2dafdfab6cfa110c14260 Mon Sep 17 00:00:00 2001
> From: Abhishek Kaushik <abhishek.kaus...@intel.com>
> Date: Fri, 31 Jan 2025 01:28:48 -0800
> Subject: [PATCH] libstdc++: Use canonical loop form in std::reduce
>
> The current while loop in std::reduce and related functions is hard to
> vectorize because the loop control variable is hard to detect.
>
> `while ((__last - __first) >= 4)`
>
> Changing the loop header to a for loop following the OpenMP canonical
> form allows easy vectorization, resulting in improved performance.
>
> `for (; __first <= __last - 4; __first += 4)`
>
> This patch modifies the loop header for std::reduce & std::transform_reduce.

Can you add a testcase to g++.dg/vect/ that is now vectorized but not before?

Thanks,
Richard.

> ---
>  libstdc++-v3/include/std/numeric | 10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/numeric 
> b/libstdc++-v3/include/std/numeric
> index 4d36fcd36d9..9c38ad89e21 100644
> --- a/libstdc++-v3/include/std/numeric
> +++ b/libstdc++-v3/include/std/numeric
> @@ -300,13 +300,12 @@ namespace __detail
>        static_assert(is_invocable_r_v<_Tp, _BinaryOperation&, __ref, __ref>);
>        if constexpr (__is_random_access_iter<_InputIterator>::value)
>     {
> -     while ((__last - __first) >= 4)
> +     for (; __first <= __last - 4; __first += 4)
>         {
>           _Tp __v1 = __binary_op(__first[0], __first[1]);
>           _Tp __v2 = __binary_op(__first[2], __first[3]);
>           _Tp __v3 = __binary_op(__v1, __v2);
>           __init = __binary_op(__init, __v3);
> -         __first += 4;
>         }
>     }
>        for (; __first != __last; ++__first)
> @@ -381,7 +380,7 @@ namespace __detail
>        if constexpr (__and_v<__is_random_access_iter<_InputIterator1>,
>                 __is_random_access_iter<_InputIterator2>>)
>     {
> -     while ((__last1 - __first1) >= 4)
> +     for (; __first1 <= __last1 - 4; __first1 += 4, __first2 += 4)
>         {
>           _Tp __v1 = __binary_op1(__binary_op2(__first1[0], __first2[0]),
>                       __binary_op2(__first1[1], __first2[1]));
> @@ -389,8 +388,6 @@ namespace __detail
>                       __binary_op2(__first1[3], __first2[3]));
>           _Tp __v3 = __binary_op1(__v1, __v2);
>           __init = __binary_op1(__init, __v3);
> -         __first1 += 4;
> -         __first2 += 4;
>         }
>     }
>        for (; __first1 != __last1; ++__first1, (void) ++__first2)
> @@ -447,7 +444,7 @@ namespace __detail
>      {
>        if constexpr (__is_random_access_iter<_InputIterator>::value)
>     {
> -     while ((__last - __first) >= 4)
> +     for (; __first <= __last - 4; __first += 4)
>         {
>           _Tp __v1 = __binary_op(__unary_op(__first[0]),
>                      __unary_op(__first[1]));
> @@ -455,7 +452,6 @@ namespace __detail
>                      __unary_op(__first[3]));
>           _Tp __v3 = __binary_op(__v1, __v2);
>           __init = __binary_op(__init, __v3);
> -         __first += 4;
>         }
>     }
>        for (; __first != __last; ++__first)
> --
> 2.31.1
>
>
>
>

Reply via email to