------- Comment #4 from jakub at gcc dot gnu dot org 2009-08-04 10:27 ------- On the 4.4 branch/x86_64-linux, I can reproduce even with -O1 on: extern "C" void abort (void);
/* Reproducer for a wrong-code issue: every statement shape, cast and
   attribute below is part of the test case and is kept exactly as reported.
   Only comments and line breaks have been added for readability.  */

/* GCC type attribute used on the scalar typedefs below so that accesses made
   through pointers to them may alias objects of other types.  */
#define _MAY_ALIAS __attribute__((__may_alias__))

/* Two-lane "vector" value types: one of floats, one of ints.  */
typedef struct { float v[2]; } _float_v;
typedef struct { int v[2]; } _int_v;

/* may_alias scalar views used to read/write the lanes of the structs above.  */
typedef unsigned int _UInt _MAY_ALIAS;
typedef signed int _Int _MAY_ALIAS;
typedef float _Float _MAY_ALIAS;

/* Lane-wise unsigned comparison.  Both arguments are passed by value and
   their lanes are read through _UInt pointers.  Bit i of the result is set
   when lane i of a is less than lane i of b.  */
static inline unsigned short
less_than (_int_v a, _int_v b)
{
  unsigned short r = 0;
  const _UInt *p1 = (const _UInt *) &a;
  const _UInt *p2 = (const _UInt *) &b;
  for (int i=0; i < 2; i++)
    if (p1[i] < p2[i])
      r |= (1 << i);
  return r;
}

/* Lane-wise product of b and c.  The lanes of c are read through an _Int
   pointer; each product is masked with 0xFFFFFFFF and cast back to int.  */
static inline _int_v
multiply (_int_v b, _int_v c)
{
  _int_v r;
  _Int *p3 = (_Int *) &c;
  for (int i=0; i < 2; i++)
    // '*' binds tighter than '&': the mask applies to the product.
    r.v[i] = (int) (b.v[i] * p3[i] & 0xFFFFFFFF);
  return r;
}

/* Lane-wise indexed load: lane i of the result is the float found at
   baseAddr (viewed as a _Float array) offset by lane i of indexes.  */
static inline _float_v
gather (_int_v indexes, const void *baseAddr)
{
  _float_v r;
  _Int *idx = (_Int *) &indexes;
  _Float *src = (_Float *) baseAddr;
  for (int i=0; i < 2; i++)
    r.v[i] = *(src + idx[i]);
  return r;
}

/* Lane-wise sum of b and c.  The operands are taken by reference and the
   result lanes are stored through an _Int pointer into a local struct.  */
static inline _int_v
add (const _int_v &b, const _int_v &c)
{
  _int_v result;
  _Int *r = (_Int *) &result;
  for (int i=0; i < 2; i++)
    r[i] = b.v[i] + c.v[i];
  return result;
}

/* Two-lane integer vector wrapper around _int_v.  */
struct uint_v
{
  _int_v data;

  // Default value is the lane index sequence { 0, 1 }.
  inline uint_v () { data.v[0] = 0; data.v[1] = 1; }

  // Broadcast: store a into both lanes through a _UInt pointer.
  inline uint_v (unsigned int a)
  {
    for (int i=0; i < 2; i++)
      *(_UInt *) &data.v[i] = a;
  }

  // Wrap an existing raw vector.
  inline uint_v (_int_v x) : data (x) {}

  // Lane-wise product; the _int_v returned by multiply() is converted back
  // to uint_v through the constructor above.
  inline uint_v operator* (const uint_v &x) const
  {
    return multiply (data, x.data);
  }

  // Lane-wise sum via add().
  inline uint_v operator+ (const uint_v &x) const
  {
    return uint_v (add (data, x.data));
  }

  // Lane-wise unsigned "less than"; returns the bitmask from less_than().
  inline unsigned short operator< (const uint_v &x) const
  {
    return less_than (data, x.data);
  }
};

/* Two-lane float vector wrapper around _float_v.  */
struct float_v
{
  _float_v data;

  // Convert each lane of x, read as unsigned int, to float.
  explicit inline float_v (const uint_v &x)
  {
    _UInt *p2 = (_UInt *) &x.data;
    for (int i=0; i < 2; i++)
      data.v[i] = p2[i];
  }

  // Gather from array at the given indexes.  The indexes are first
  // multiplied by a broadcast 1; the temporary product is bound to a
  // const reference before being passed to gather().
  inline float_v (const float *array, const uint_v &indexes)
  {
    const uint_v &offsets = indexes * uint_v (1);
    data = gather (offsets.data, array);
  }

  // Lane-wise equality; bit i of the result is set when lane i matches.
  // Declared noinline, so the comparison is an out-of-line call that reads
  // both operands through pointers.
  __attribute__((noinline)) unsigned short operator== (const float_v &x) const
  {
    unsigned short r = 0;
    for (int i=0; i < 2; i++)
      if (data.v[i] == x.data.v[i])
        r |= (1 << i);
    return r;
  }
};

/* Driver.  i starts as { 0, 1 }, so (i < 2) yields mask 3 and the body runs;
   after i = i + 2 the lanes are { 2, 3 }, the mask is 0 and the loop ends.
   In the body, ii is float (i + 2) = { 2, 3 } and a is array gathered at
   indexes { 0, 1 } = { 2, 3 }, so (a == ii) is expected to be 3.  Any other
   value calls abort ().  */
int main ()
{
  const float array[2] = { 2, 3 };
  unsigned short mask;
  for (uint_v i; (mask = (i < 2)) == 3; i = i + 2)
    {
      const float_v ii (i + 2);
      // The asm template is only an assembler comment; the statement takes
      // the addresses of the live objects as inputs and clobbers "memory".
      asm volatile ("# Barrier 1 %0 %1 %2" : : "r" (&ii), "r" (&i), "r" (array) : "memory");
      float_v a (array, i);
      // Same kind of barrier, now also taking the address of a.
      asm volatile ("# Barrier 2 %0 %1 %2 %3" : : "r" (&ii), "r" (&i), "r" (array), "r" (&a) : "memory");
      if ((a == ii) != 3)
        abort();
    }
  return 0;
}
Apparently there is some stack slot sharing and RTL DSE decides to remove a store to stack related address as dead when that memory is later on read using a non-%rsp related read. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40924