Hello!

While eyeballing the following testcase:

/* Operands (a, b) and result (c) of the element-wise division below.  */
float a[256], b[256], c[256];

/* Store a[k] / b[k] into c[k] for every one of the 256 elements.  */
void foo(void)
{
  int k = 0;

  while (k < 256)
    {
      c[k] = a[k] / b[k];
      ++k;
    }
}

-O2 -ftree-vectorize -ffast-math

I noticed that, for some reason, CSE doesn't eliminate the redundant memory read of b, resulting in:

.L2:
        vrcpps  b(%rax), %ymm0
        vmulps  b(%rax), %ymm0, %ymm1
        vmulps  %ymm1, %ymm0, %ymm1
        vaddps  %ymm0, %ymm0, %ymm0
        vsubps  %ymm1, %ymm0, %ymm1
        vmulps  a(%rax), %ymm1, %ymm1
        vmovaps %ymm1, c(%rax)
        addq    $32, %rax
        cmpq    $1024, %rax
        jne     .L2

The attached patch forces the memory operand into a register, producing:

.L2:
        vmovaps b(%rax), %ymm1
        vrcpps  %ymm1, %ymm0
        vmulps  %ymm1, %ymm0, %ymm1
        vmulps  %ymm1, %ymm0, %ymm1
        vaddps  %ymm0, %ymm0, %ymm0
        vsubps  %ymm1, %ymm0, %ymm1
        vmulps  a(%rax), %ymm1, %ymm1
        vmovaps %ymm1, c(%rax)
        addq    $32, %rax
        cmpq    $1024, %rax
        jne     .L2

The same cure can be applied to the rsqrt sequences.

2011-10-21  Uros Bizjak  <ubiz...@gmail.com>

        * config/i386/i386.c (ix86_emit_swdivsf): Force b into register.
        (ix86_emit_swsqrtsf): Force a into register.

The patch was tested on x86_64-pc-linux-gnu and committed to mainline SVN.

Uros.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 180255)
+++ config/i386/i386.c  (working copy)
@@ -33682,6 +33682,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enu
 
   /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
 
+  b = force_reg (mode, b);
+
   /* x0 = rcp(b) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
                          gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
@@ -33737,6 +33739,8 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum mach
   /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
      rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
 
+  a = force_reg (mode, a);
+
   /* x0 = rsqrt(a) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
                          gen_rtx_UNSPEC (mode, gen_rtvec (1, a),

Reply via email to