https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90018

--- Comment #15 from Richard Biener <rguenth at gcc dot gnu.org> ---
So the issue is really that for

  for (int i = 0; i < n; ++i)
    {
      double tem1 = a4[i*4] + a4[i*4+n*4] (**);
      double tem2 = a4[i*4+2*n*4+1];
      a4[i*4+n*4+1] = tem1;
      a4[i*4+1] = tem2;
      double tem3 = a4[i*4] - tem2;
      double tem4 = tem3 + a4[i*4+n*4];
      a4[i*4+n*4+1] = tem3 + a4[i*4+n*4+1] (**);
    }

we detect an interleaving load for (**) and emit it before the
later strided store to a4[i*4+n*4+1].

The issue is that vect_preserves_scalar_order_p expects that the
vectorization will happen via SLP, but we will end up doing interleaving,
which does not perform the load in place of the last load but in place
of ->first_element.  Unfortunately SLP analysis is done _after_
dependence analysis.  That means we have to conservatively assume both
paths may happen.

Fixed testcase:

/* Regression kernel for GCC PR90018 (vectorizer load/store ordering).

   For each i in [0, n) the loop reads a4[i*4] and a4[i*4+n*4], reads
   a4[i*4+2*n*4+1], stores to a4[i*4+n*4+1] and a4[i*4+1], and finally
   re-reads and overwrites a4[i*4+n*4+1].  The highest index touched is
   (n-1)*4 + 2*n*4 + 1, so a4 must provide at least that many + 1 elements.

   The exact statement order is what the test exercises: the load of
   a4[i*4+n*4+1] in the last statement must observe the store of tem1
   made earlier in the same iteration.  Do not reorder or simplify.  */
void __attribute__((noinline,noclone))
foo (double *a4, int n)
{
  for (int i = 0; i < n; ++i)
    {
      /* Load of a4[i*4+n*4]: one of the two loads the report marks (**).  */
      double tem1 = a4[i*4] + a4[i*4+n*4];
      double tem2 = a4[i*4+2*n*4+1];
      /* Store that sits between the two (**) loads in scalar order.  */
      a4[i*4+n*4+1] = tem1;
      a4[i*4+1] = tem2;
      double tem3 = a4[i*4] - tem2;
      double tem4 = tem3 + a4[i*4+n*4];
      /* Load of a4[i*4+n*4+1]: the other (**) load; it reads back the
         tem1 value stored above, so it must not be moved ahead of that
         store.  */
      a4[i*4+n*4+1] = tem4 + a4[i*4+n*4+1];
    }
}
/* Test driver: runs foo on one array, repeats the same statements in-line
   on an identically initialised second array, and aborts if the two
   arrays differ in any element.  Returns 0 when they match.  */
int main(int argc, char **argv)
{
  int n = 11;
  /* Both buffers hold 4*n*8 elements, which exceeds the highest index
     (12*n - 3) accessed by the loops below.  */
  double a4[4 * n * 8];
  double a42[4 * n * 8];
  /* Give both arrays the same contents: element k holds the value k.  */
  for (int i = 0; i < 4 * n * 8; ++i)
    a4[i] = a42[i] = i;
  foo (a4, n);
  /* Reference computation: the same statement sequence as foo's loop
     body, applied to a42.  */
  for (int i = 0; i < n; ++i)
    {
      double tem1 = a42[i*4] + a42[i*4+n*4];
      double tem2 = a42[i*4+2*n*4+1];
      a42[i*4+n*4+1] = tem1;
      a42[i*4+1] = tem2;
      double tem3 = a42[i*4] - tem2;
      double tem4 = tem3 + a42[i*4+n*4];
      a42[i*4+n*4+1] = tem4 + a42[i*4+n*4+1];
      /* Empty asm with a "memory" clobber acts as a compiler barrier on
         every iteration; presumably it keeps this reference loop from
         being vectorized the way foo's loop is -- confirm.  */
      __asm__ volatile ("": : : "memory");
    }
  /* Any mismatch means foo produced different results than the
     reference loop.  */
  for (int i = 0; i < 4 * n * 8; ++i)
    if (a4[i] != a42[i])
      __builtin_abort ();
  return 0;
}

Reply via email to