https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88464
--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---

So, let's use a more complete testcase:

void
f1 (double * __restrict__ a, double const * __restrict__ b,
    int const * __restrict__ off1, int const * __restrict__ off2, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

void
f2 (double * __restrict__ a, double const * __restrict__ b,
    long const * __restrict__ off1, long const * __restrict__ off2, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

void
f3 (float * __restrict__ a, float const * __restrict__ b,
    int const * __restrict__ off1, int const * __restrict__ off2, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

void
f4 (double * __restrict__ a, const double * __restrict__ b,
    int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

void
f5 (double * __restrict__ a, const double * __restrict__ b,
    long const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

void
f6 (float * __restrict__ a, const float * __restrict__ b,
    int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

void
f7 (double * __restrict__ a, const double * __restrict__ b,
    int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

void
f8 (double * __restrict__ a, const double * __restrict__ b,
    long const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

void
f9 (float * __restrict__ a, const float * __restrict__ b,
    int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

void
f10 (double * __restrict__ a, const double * __restrict__ b,
     int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

void
f11 (double * __restrict__ a, const double * __restrict__ b,
     long const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

void
f12 (float * __restrict__ a, const float * __restrict__ b,
     int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

void
f13 (double * __restrict__ a, const double * __restrict__ b,
     int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

void
f14 (double * __restrict__ a, const double * __restrict__ b,
     long const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

void
f15 (float * __restrict__ a, const float * __restrict__ b,
     int const * __restrict__ off, int n)
{
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

With -O3 -mavx512{f,bw,dq,vl} -mtune=skylake-avx512, the f4/f5/f6 loops are vectorized for all of -mprefer-vector-width={128,256,512} (though not with, e.g., -mtune=generic), and f7/f8/f9 are vectorized only with -mprefer-vector-width=512, using vscatter*.

f10/f11/f12 are vectorized using vgather*, but strangely, even in -mprefer-vector-width=512 mode, they use the AVX2 gathers. That suggests that while unconditional scatters and gathers are properly supported, support for the conditional ones (both MASK_LOAD and MASK_STORE) in the case where the mask lives in a mask register, rather than in a vector register holding the mask, either hasn't been implemented or doesn't work properly.