https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111683

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rguenth at gcc dot gnu.org

--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
It is indeed pcom (the predictive commoning pass).
Here is a slightly better testcase, which uses long long instead of double arithmetic:
/* Reproducer using unaligned two-lane long long vector accesses.
   b supplies the per-iteration addend; c is the accumulator and, being a
   file-scope object without an initializer, starts out all zeros.  */
long long b[6] = { 3, 4, 5, 6, 7, 8 };
long long c[16];
/* 16-byte vector of long long that may alias anything and may be accessed
   at any byte alignment (aligned(1)).  */
typedef long long U __attribute__ ((vector_size(16), may_alias, aligned(1)));
/* Same vector shape without the aligned(1) override; used for the addend.  */
typedef long long V __attribute__ ((vector_size(16), may_alias));

int
main ()
{
  for (int f = 0; f < 6; f++)
    {
      /* 16-byte read-modify-write: adds b[f] to c[f] and c[f + 1].  */
      *(U *) &c[f] = *(U *) &c[f] + (V) { b[f], b[f] };
      /* 16-byte read-modify-write: adds b[f] to c[f + 2] and c[f + 3].
	 The index advances by one element (8 bytes) per iteration while each
	 access covers two elements, so accesses of adjacent iterations
	 overlap.  */
      *(U *) &c[f + 2] = *(U *) &c[f + 2] + (V) { b[f], b[f] };
    }
  /* c[1] is only written by the first statement: b[0] in iteration 0 and
     b[1] in iteration 1, so the correct result is 3 + 4 = 7.  */
  if (c[1] != 7)
    __builtin_abort ();
  return 0;
}

And it doesn't need to be about vector types either, nor use any aligned(1)
loads/stores in the source:
/* Scalar reproducer: no vector types and no aligned(1) accesses in the
   source.  c is the accumulator (zero-initialized at file scope); b is
   declared but not referenced by this testcase.  */
int b[6] = { 3, 4, 5, 6, 7, 8 }, c[12];
/* Expected contents of c after the loop.  Only the first sizeof (c) bytes
   (12 ints) are compared below.  */
int d[16] = { 0, 1, 3, 6, 10, 14, 12, 9, 5, 0, 0, 0 };

int
main ()
{
  int i;
  /* The test relies on a long long covering exactly two ints; otherwise
     exit successfully without running it.  */
  if (sizeof (int) * 2 != sizeof (long long))
    return 0;
  for (i = 0; i < 6; i++)
    {
      long long a;
      /* Load the two ints c[i], c[i + 1] as one long long.  */
      __builtin_memcpy (&a, &c[i], sizeof (a));
      /* Add i to both int-sized halves of a.  */
      a += (((long long) i) << (sizeof (int) * __CHAR_BIT__)) + i;
      __builtin_memcpy (&c[i], &a, sizeof (a));
      /* Same read-modify-write on c[i + 2], c[i + 3].  The address advances
	 by one int per iteration while each access covers two ints, so
	 accesses of adjacent iterations partially overlap.  */
      __builtin_memcpy (&a, &c[i + 2], sizeof (a));
      a += (((long long) i) << (sizeof (int) * __CHAR_BIT__)) + i;
      __builtin_memcpy (&c[i + 2], &a, sizeof (a));
    }
  /* Abort if c does not match the expected values in d.  */
  if (__builtin_memcmp (&c[0], &d[0], sizeof (c)))
    __builtin_abort ();
  return 0;
}
For the last testcase, the *.pcom dump differs from the previous dump as follows:
 int main ()
 {
+  unsigned long D__lsm1.7;
+  unsigned long D__lsm0.6;
   long long int a;
   int i;
   int * _1;
@@ -21,18 +31,24 @@ int main ()
   unsigned long _20;
   sizetype _25;
   sizetype _28;
+  unsigned long _33;
+  unsigned long _37;
   unsigned int ivtmp_38;
   unsigned int ivtmp_39;

   <bb 2> [local count: 153437704]:
+  _37 = MEM <unsigned long> [(char * {ref-all})&c];
+  _33 = MEM <unsigned long> [(char * {ref-all})&c + 4B];

   <bb 3> [local count: 920304121]:
   # i_26 = PHI <i_23(7), 0(2)>
   # ivtmp_39 = PHI <ivtmp_38(7), 6(2)>
+  # D__lsm0.6_34 = PHI <D__lsm1.7_35(7), _37(2)>
+  # D__lsm1.7_35 = PHI <D__lsm0.6_36(7), _33(2)>
   _28 = (sizetype) i_26;
   _9 = _28 * 4;
   _1 = &c + _9;
-  _12 = MEM <unsigned long> [(char * {ref-all})_1];
+  _12 = D__lsm0.6_34;
   a_13 = (long long int) _12;
   _19 = _28 * 4294967297;
   _2 = (long long int) _19;
@@ -48,6 +64,7 @@ int main ()
   _6 = _2 + a_18;
   _20 = (unsigned long) _6;
   MEM <unsigned long> [(char * {ref-all})_5] = _20;
+  D__lsm0.6_36 = _20;
   i_23 = i_26 + 1;
   ivtmp_38 = ivtmp_39 - 1;
   if (ivtmp_38 != 0)

The data refs in the loop have access fns {0B, +, 4}_1 and {8B, +, 4}_1, so
within the same iteration there is no overlap between the two. But because
they are actually 8-byte loads/stores with a 4-byte step, each of them
partially overlaps itself between adjacent iterations, and pcom doesn't take
that into account.

So, should we punt somewhere for references whose access size is larger than
their DR_STEP?
Or something else?

Reply via email to