https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111874
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
For integers, we have _mm512_mask_reduce_add_epi32 defined as:

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
{
  __A = _mm512_maskz_mov_epi32 (__U, __A);
  __MM512_REDUCE_OP (+);
}

#undef __MM512_REDUCE_OP
#define __MM512_REDUCE_OP(op) \
  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); \
  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); \
  __m256i __T3 = (__m256i) (__T1 op __T2); \
  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); \
  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); \
  __v4si __T6 = __T4 op __T5; \
  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
  __v4si __T8 = __T6 op __T7; \
  return __T8[0] op __T8[1]

There's a corresponding floating-point version, but it does not perform the adds in order.