https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88464

--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
So, let's use a more complete testcase:
/* f1: for each i in [0, n), compare b[off1[i]] with b[off2[i]] and copy one
   element of b into the same index of a: index off1[i] when
   b[off1[i]] < b[off2[i]], index off2[i] otherwise.
   Both the loads from b and the stores to a are indirectly indexed through
   the offset arrays, and each store is conditional.
   Element type: double.  Offset type: int.  */
void
f1 (double * __restrict__ a, double const * __restrict__ b,
    int const * __restrict__ off1, int const * __restrict__ off2, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

/* f2: for each i in [0, n), compare b[off1[i]] with b[off2[i]] and copy one
   element of b into the same index of a: index off1[i] when
   b[off1[i]] < b[off2[i]], index off2[i] otherwise.
   Both the loads from b and the stores to a are indirectly indexed through
   the offset arrays, and each store is conditional.
   Element type: double.  Offset type: long.  */
void
f2 (double * __restrict__ a, double const * __restrict__ b,
    long const * __restrict__ off1, long const * __restrict__ off2, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

/* f3: for each i in [0, n), compare b[off1[i]] with b[off2[i]] and copy one
   element of b into the same index of a: index off1[i] when
   b[off1[i]] < b[off2[i]], index off2[i] otherwise.
   Both the loads from b and the stores to a are indirectly indexed through
   the offset arrays, and each store is conditional.
   Element type: float.  Offset type: int.  */
void
f3 (float * __restrict__ a, float const * __restrict__ b,
    int const * __restrict__ off1, int const * __restrict__ off2, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    {
      if (b[off1[i]] < b[off2[i]])
        a[off1[i]] = b[off1[i]];
      else
        a[off2[i]] = b[off2[i]];
    }
}

/* f4: unconditional indexed load.  For each i in [0, n), a[i] = b[off[i]]:
   a is written contiguously, b is read through the offset array.
   Element type: double.  Offset type: int.  */
void
f4 (double * __restrict__ a, const double * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

/* f5: unconditional indexed load.  For each i in [0, n), a[i] = b[off[i]]:
   a is written contiguously, b is read through the offset array.
   Element type: double.  Offset type: long.  */
void
f5 (double * __restrict__ a, const double * __restrict__ b, long const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

/* f6: unconditional indexed load.  For each i in [0, n), a[i] = b[off[i]]:
   a is written contiguously, b is read through the offset array.
   Element type: float.  Offset type: int.  */
void
f6 (float * __restrict__ a, const float * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[i] = b[off[i]];
}

/* f7: unconditional indexed store.  For each i in [0, n), a[off[i]] = b[i]:
   b is read contiguously, a is written through the offset array.
   Element type: double.  Offset type: int.  */
void
f7 (double * __restrict__ a, const double * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

/* f8: unconditional indexed store.  For each i in [0, n), a[off[i]] = b[i]:
   b is read contiguously, a is written through the offset array.
   Element type: double.  Offset type: long.  */
void
f8 (double * __restrict__ a, const double * __restrict__ b, long const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

/* f9: unconditional indexed store.  For each i in [0, n), a[off[i]] = b[i]:
   b is read contiguously, a is written through the offset array.
   Element type: float.  Offset type: int.  */
void
f9 (float * __restrict__ a, const float * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    a[off[i]] = b[i];
}

/* f10: conditional indexed load.  For each i in [0, n), when a[i] > 10,
   a[i] is overwritten with b[off[i]]; otherwise a[i] is left unchanged.
   The indexed read of b happens only under the condition.
   Element type: double.  Offset type: int.  */
void
f10 (double * __restrict__ a, const double * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

/* f11: conditional indexed load.  For each i in [0, n), when a[i] > 10,
   a[i] is overwritten with b[off[i]]; otherwise a[i] is left unchanged.
   The indexed read of b happens only under the condition.
   Element type: double.  Offset type: long.  */
void
f11 (double * __restrict__ a, const double * __restrict__ b, long const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

/* f12: conditional indexed load.  For each i in [0, n), when a[i] > 10,
   a[i] is overwritten with b[off[i]]; otherwise a[i] is left unchanged.
   The indexed read of b happens only under the condition.
   Element type: float.  Offset type: int.  */
void
f12 (float * __restrict__ a, const float * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (a[i] > 10)
      a[i] = b[off[i]];
}

/* f13: conditional indexed store.  For each i in [0, n), when b[i] > 10,
   b[i] is written to a[off[i]]; otherwise nothing is stored.
   b is read contiguously, a is written through the offset array.
   Element type: double.  Offset type: int.  */
void
f13 (double * __restrict__ a, const double * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

/* f14: conditional indexed store.  For each i in [0, n), when b[i] > 10,
   b[i] is written to a[off[i]]; otherwise nothing is stored.
   b is read contiguously, a is written through the offset array.
   Element type: double.  Offset type: long.  */
void
f14 (double * __restrict__ a, const double * __restrict__ b, long const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

/* f15: conditional indexed store.  For each i in [0, n), when b[i] > 10,
   b[i] is written to a[off[i]]; otherwise nothing is stored.
   b is read contiguously, a is written through the offset array.
   Element type: float.  Offset type: int.  */
void
f15 (float * __restrict__ a, const float * __restrict__ b, int const *
__restrict__ off, int n)
{
/* Asserts to GCC that no loop-carried dependence prevents vectorizing the
   following loop.  */
#pragma GCC ivdep
  for (int i = 0; i < n; ++i)
    if (b[i] > 10)
      a[off[i]] = b[i];
}

With -O3 -mavx512{f,bw,dq,vl} -mtune=skylake-avx512,
the f4/f5/f6 loops are vectorized for all of
-mprefer-vector-width={128,256,512}
(though not, e.g., with -mtune=generic), whereas f7/f8/f9 are vectorized only
with -mprefer-vector-width=512, using vscatter*.
f10/f11/f12 are vectorized using vgather*, but strangely, even in
-mprefer-vector-width=512 mode, they use the AVX2 gathers.  That suggests that
while unconditional scatters and gathers are properly supported, for
conditional ones (both MASK_LOAD and MASK_STORE) the support for using a mask
register, rather than a vector register holding the mask, either hasn't been
implemented or doesn't work properly.

Reply via email to