https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111683
Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rguenth at gcc dot gnu.org

--- Comment #14 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
It is indeed pcom.
Slightly better testcase which doesn't use double arithmetic but long long:

long long b[6] = { 3, 4, 5, 6, 7, 8 };
long long c[16];
typedef long long U __attribute__ ((vector_size(16), may_alias, aligned(1)));
typedef long long V __attribute__ ((vector_size(16), may_alias));

int
main ()
{
  for (int f = 0; f < 6; f++)
    {
      *(U *) &c[f] = *(U *) &c[f] + (V) { b[f], b[f] };
      *(U *) &c[f + 2] = *(U *) &c[f + 2] + (V) { b[f], b[f] };
    }
  if (c[1] != 7)
    __builtin_abort ();
  return 0;
}

And it doesn't need to be about vector types either, nor use any aligned(1)
loads/stores in the source:

int b[6] = { 3, 4, 5, 6, 7, 8 }, c[12];
int d[16] = { 0, 1, 3, 6, 10, 14, 12, 9, 5, 0, 0, 0 };

int
main ()
{
  int i;
  if (sizeof (int) * 2 != sizeof (long long))
    return 0;
  for (i = 0; i < 6; i++)
    {
      long long a;
      __builtin_memcpy (&a, &c[i], sizeof (a));
      a += (((long long) i) << (sizeof (int) * __CHAR_BIT__)) + i;
      __builtin_memcpy (&c[i], &a, sizeof (a));
      __builtin_memcpy (&a, &c[i + 2], sizeof (a));
      a += (((long long) i) << (sizeof (int) * __CHAR_BIT__)) + i;
      __builtin_memcpy (&c[i + 2], &a, sizeof (a));
    }
  if (__builtin_memcmp (&c[0], &d[0], sizeof (c)))
    __builtin_abort ();
  return 0;
}

On the last testcase the *.pcom change against previous dump is:

 int main ()
 {
+  unsigned long D__lsm1.7;
+  unsigned long D__lsm0.6;
   long long int a;
   int i;
   int * _1;
@@ -21,18 +31,24 @@ int main ()
   unsigned long _20;
   sizetype _25;
   sizetype _28;
+  unsigned long _33;
+  unsigned long _37;
   unsigned int ivtmp_38;
   unsigned int ivtmp_39;
 
   <bb 2> [local count: 153437704]:
+  _37 = MEM <unsigned long> [(char * {ref-all})&c];
+  _33 = MEM <unsigned long> [(char * {ref-all})&c + 4B];
 
   <bb 3> [local count: 920304121]:
   # i_26 = PHI <i_23(7), 0(2)>
   # ivtmp_39 = PHI <ivtmp_38(7), 6(2)>
+  # D__lsm0.6_34 = PHI <D__lsm1.7_35(7), _37(2)>
+  # D__lsm1.7_35 = PHI <D__lsm0.6_36(7), _33(2)>
   _28 = (sizetype) i_26;
   _9 = _28 * 4;
   _1 = &c + _9;
-  _12 = MEM <unsigned long> [(char * {ref-all})_1];
+  _12 = D__lsm0.6_34;
   a_13 = (long long int) _12;
   _19 = _28 * 4294967297;
   _2 = (long long int) _19;
@@ -48,6 +64,7 @@ int main ()
   _6 = _2 + a_18;
   _20 = (unsigned long) _6;
   MEM <unsigned long> [(char * {ref-all})_5] = _20;
+  D__lsm0.6_36 = _20;
   i_23 = i_26 + 1;
   ivtmp_38 = ivtmp_39 - 1;
   if (ivtmp_38 != 0)

The data refs in the loop have access fns {0B, +, 4}_1 and {8B, +, 4}_1, so
within the same iteration there is no overlap between the two, but because
they are actually 8-byte loads/stores, each of them partially overlaps with
itself between adjacent iterations, which pcom doesn't take into account.
So, should we punt somewhere for references whose access size is larger than
their DR_STEP?  Something else?