This first patch optimises the output for the vld3q and vld4q functions.
Each of these functions expands into two individual instructions (two
vld3s or two vld4s respectively), with each instruction setting one
(interleaved) half of the output register.  The problem was that both
instructions treated the output register as an input, whereas only the
second one needs to.  The compiler therefore treated the output register
as being live before the vldNq and generated unnecessary spill code.

E.g.:

#include <arm_neon.h>

/* Reproducer for the unnecessary spill code: load two uint32x4x3_t
   structures from A and B, add them member by member and store the
   result back through A.  C is unused.  */
void
foo (uint32_t *a, uint32_t *b, uint32_t *c)
{
  uint32x4x3_t x, y;

  /* The two vld3q_u32 calls whose expansion the patch improves.  */
  x = vld3q_u32 (a);
  y = vld3q_u32 (b);
  /* Add the corresponding members of X and Y, accumulating into X.  */
  x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
  x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
  x.val[2] = vaddq_u32 (x.val[2], y.val[2]);
  vst3q_u32 (a, x);
}

gave:

        stmfd   sp!, {r3, fp}
        ldr     r2, .L2
        add     fp, sp, #4
        vldmia  r2, {d16-d21}
        sub     sp, sp, #112
        vmov    q11, q8  @ ti
        vmov    q12, q9  @ ti
        vmov    q13, q10  @ ti
        ...

where the vldmia is loading the x and y "inputs" to the two vld3q_u32s
from the corresponding stack slots.

The patch is a backport of:

    http://gcc.gnu.org/ml/gcc-patches/2011-03/msg01634.html

which has been applied to mainline (GCC 4.7).  No changes were needed
to backport it to 4.5.

Richard


gcc/
        Backport from mainline:

        2011-03-30  Richard Sandiford  <richard.sandif...@linaro.org>
                    Ramana Radhakrishnan  <ramana.radhakrish...@linaro.org>

        PR target/43590
        * config/arm/neon.md (neon_vld3qa<mode>, neon_vld4qa<mode>): Remove
        operand 1 and reshuffle the operands to match.
        (neon_vld3<mode>, neon_vld4<mode>): Update accordingly.

Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md      2011-04-19 13:55:04.000000000 +0000
+++ gcc/config/arm/neon.md      2011-04-19 13:55:04.000000000 +0000
@@ -4925,8 +4925,7 @@ (define_expand "neon_vld3<mode>"
    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
-  emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
-                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[1], operands[1]));
   emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
                                     operands[1], operands[1]));
   DONE;
@@ -4934,12 +4933,11 @@ (define_expand "neon_vld3<mode>"
 
 (define_insn "neon_vld3qa<mode>"
   [(set (match_operand:CI 0 "s_register_operand" "=w")
-        (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
-                    (match_operand:CI 1 "s_register_operand" "0")
+        (unspec:CI [(mem:CI (match_operand:SI 2 "s_register_operand" "1"))
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3A))
-   (set (match_operand:SI 2 "s_register_operand" "=r")
-        (plus:SI (match_dup 3)
+   (set (match_operand:SI 1 "s_register_operand" "=r")
+        (plus:SI (match_dup 2)
                 (const_int 24)))]
   "TARGET_NEON"
 {
@@ -4948,7 +4946,7 @@ (define_insn "neon_vld3qa<mode>"
   ops[0] = gen_rtx_REG (DImode, regno);
   ops[1] = gen_rtx_REG (DImode, regno + 4);
   ops[2] = gen_rtx_REG (DImode, regno + 8);
-  ops[3] = operands[2];
+  ops[3] = operands[1];
   output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
   return "";
 }
@@ -5217,8 +5215,7 @@ (define_expand "neon_vld4<mode>"
    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
-  emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
-                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[1], operands[1]));
   emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
                                     operands[1], operands[1]));
   DONE;
@@ -5226,12 +5223,11 @@ (define_expand "neon_vld4<mode>"
 
 (define_insn "neon_vld4qa<mode>"
   [(set (match_operand:XI 0 "s_register_operand" "=w")
-        (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
-                    (match_operand:XI 1 "s_register_operand" "0")
+        (unspec:XI [(mem:XI (match_operand:SI 2 "s_register_operand" "1"))
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4A))
-   (set (match_operand:SI 2 "s_register_operand" "=r")
-        (plus:SI (match_dup 3)
+   (set (match_operand:SI 1 "s_register_operand" "=r")
+        (plus:SI (match_dup 2)
                 (const_int 32)))]
   "TARGET_NEON"
 {
@@ -5241,7 +5237,7 @@ (define_insn "neon_vld4qa<mode>"
   ops[1] = gen_rtx_REG (DImode, regno + 4);
   ops[2] = gen_rtx_REG (DImode, regno + 8);
   ops[3] = gen_rtx_REG (DImode, regno + 12);
-  ops[4] = operands[2];
+  ops[4] = operands[1];
   output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
   return "";
 }

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

Reply via email to