https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111874

--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
For integers, we have _mm512_mask_reduce_add_epi32 defined as

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (+);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1);            \
  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0);            \
  __m256i __T3 = (__m256i) (__T1 op __T2);                              \
  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1);            \
  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0);            \
  __v4si __T6 = __T4 op __T5;                                           \
  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });      \
  __v4si __T8 = __T6 op __T7;                                           \
  return __T8[0] op __T8[1]
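
For illustration, a minimal usage sketch of the intrinsic (my own example, not from the header; assumes a compiler and CPU with AVX-512F support and compilation with -mavx512f; the data and mask values are made up):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* Lanes 0..15 hold the values 1..16 (_mm512_set_epi32 takes the
     highest lane first).  The mask keeps only lanes 0..7.  */
  __m512i v = _mm512_set_epi32 (16, 15, 14, 13, 12, 11, 10, 9,
				8, 7, 6, 5, 4, 3, 2, 1);
  __mmask16 m = 0x00FF;
  /* Masked-off lanes are zeroed first, so they do not contribute.  */
  int sum = _mm512_mask_reduce_add_epi32 (m, v);
  printf ("%d\n", sum);		/* 1 + 2 + ... + 8 = 36 */
  return 0;
}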

There's a corresponding floating-point version, but it does not perform the adds in order.
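
For comparison, a strictly in-order masked float reduction would have to accumulate the selected lanes sequentially, lane 0 first. A minimal sketch of what that would look like (my own illustration, not how the header implements it; same AVX-512F assumptions as above):

#include <immintrin.h>

/* Hypothetical helper, not part of the header: sums the lanes of __A
   selected by __U in lane order, i.e. a sequential (in-order) add.
   The header's floating-point reduce is not in-order (as noted above),
   so its rounding can differ from this.  */
static inline float
in_order_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
{
  __A = _mm512_maskz_mov_ps (__U, __A);	/* zero the masked-off lanes */
  float __sum = 0.0f;
  for (int __i = 0; __i < 16; __i++)
    __sum += ((__v16sf) __A)[__i];	/* lane 0 first, lane 15 last */
  return __sum;
}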
