Hello! While eyeballing the following testcase:
float a[256], b[256], c[256];

void foo(void)
{
  int i;

  for (i=0; i<256; ++i)
    c[i] = a[i] / b[i];
}

compiled with -O2 -ftree-vectorize -ffast-math, I noticed that for some reason CSE doesn't eliminate the redundant memory read of b, resulting in:

.L2:
        vrcpps  b(%rax), %ymm0
        vmulps  b(%rax), %ymm0, %ymm1
        vmulps  %ymm1, %ymm0, %ymm1
        vaddps  %ymm0, %ymm0, %ymm0
        vsubps  %ymm1, %ymm0, %ymm1
        vmulps  a(%rax), %ymm1, %ymm1
        vmovaps %ymm1, c(%rax)
        addq    $32, %rax
        cmpq    $1024, %rax
        jne     .L2

The attached patch forces the memory operand into a register, producing:

.L2:
        vmovaps b(%rax), %ymm1
        vrcpps  %ymm1, %ymm0
        vmulps  %ymm1, %ymm0, %ymm1
        vmulps  %ymm1, %ymm0, %ymm1
        vaddps  %ymm0, %ymm0, %ymm0
        vsubps  %ymm1, %ymm0, %ymm1
        vmulps  a(%rax), %ymm1, %ymm1
        vmovaps %ymm1, c(%rax)
        addq    $32, %rax
        cmpq    $1024, %rax
        jne     .L2

The same cure could be applied to the rsqrt sequences.

2011-10-21  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/i386.c (ix86_emit_swdivsf): Force b into register.
        (ix86_emit_swsqrtsf): Force a into register.

The patch was tested on x86_64-pc-linux-gnu and committed to mainline SVN.

Uros.
Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 180255) +++ config/i386/i386.c (working copy) @@ -33682,6 +33682,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enu /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */ + b = force_reg (mode, b); + /* x0 = rcp(b) estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), @@ -33737,6 +33739,8 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum mach /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ + a = force_reg (mode, a); + /* x0 = rsqrt(a) estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),