https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66419

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
The difference is that we vectorize parts of stdarg_func:

<L3>:
  _57 = ap.__vr_top;
  _58 = (sizetype) _39;
  _59 = _57 + _58;
  _60 = *_59;
  MEM[(float *)&ha.20] = _60;
  _64 = _58 + 16;
  _65 = _57 + _64;
  _66 = MEM[(float *)_65];
  MEM[(float *)&ha.20 + 4B] = _66;
  _70 = _58 + 32;
  _71 = _57 + _70;
  _72 = MEM[(float *)_71];
  MEM[(float *)&ha.20 + 8B] = _72;
  _76 = _58 + 48;
  _77 = _57 + _76;
  _78 = MEM[(float *)_77];
  MEM[(float *)&ha.20 + 12B] = _78;

to

<L3>:
  _57 = ap.__vr_top;
  _58 = (sizetype) _39;
  _59 = _57 + _58;
  _60 = *_59;
  _64 = _58 + 16;
  _65 = _57 + _64;
  _66 = MEM[(float *)_65];
  _70 = _58 + 32;
  _71 = _57 + _70;
  _72 = MEM[(float *)_71];
  _76 = _58 + 48;
  _77 = _57 + _76;
  vectp.40_138 = _59;
  vect__60.41_125 = MEM[(float *)vectp.40_138];
  _78 = MEM[(float *)_77];
  vectp.43_107 = &ha.20;
  MEM[(float *)vectp.43_107] = vect__60.41_125;

which looks bogus.  The load permutation

/space/rguenther/src/svn/trunk2/gcc/testsuite/gcc.target/aarch64/aapcs64/abitest.h:98:6:
note: Load permutation 0 4 8 12

due to the gaps is not reflected in the transform.  Cost considerations
should also have prohibited the transform in the first place.

C testcase that fails on x86_64 as well:

/* Declared by hand so the testcase needs no headers.  */
extern void abort (void);

/* Source array; foo reads elements 0, 4, 8 and 12 from it.  */
int a[16];
/* Destination array; foo writes all four elements.  */
int b[4];

/* Copy every fourth element of a (indices 0, 4, 8, 12) into the four
   consecutive elements of b.  The four stores are contiguous while the
   four loads have gaps between them; this is the source-level shape of
   the "Load permutation 0 4 8 12" access pattern reported above.
   The statements are deliberately written out one by one -- keep them
   as-is, since the testcase exists to exercise this exact form.
   noinline keeps the function from being inlined into main.  */
void __attribute__((noinline))
foo (void)
{
  b[0] = a[0];
  b[1] = a[4];
  b[2] = a[8];
  b[3] = a[12];
}

/* Driver: fill a with a[i] == i, call foo, and abort unless b holds
   exactly { 0, 4, 8, 12 }.  Returns 0 when the copy was correct.  */
int main()
{
  int i;
  for (i = 0; i < 16; ++i)
    {
      a[i] = i;
      /* Empty volatile asm in the loop body; presumably here to keep the
	 compiler from transforming the initialization loop itself.  */
      __asm__ volatile ("");
    }
  foo ();
  /* Each b[k] must equal a[4 * k], i.e. 4 * k.  */
  if (b[0] != 0 || b[1] != 4 || b[2] != 8 || b[3] != 12)
    abort ();
  return 0;
}

Reply via email to