Tweak ARM vld3q and vld4q patterns

Richard Sandiford Thu, 24 Mar 2011 08:57:40 -0700

The ARM vld3q and vld4q .md patterns expand into two individual vld3/vld4
instructions.  Each instruction loads half of the total elements.
The problem is that this is implemented as:


  array = vld3a (array, mem1)
  array = vld3b (array, mem2)

with "array" being an input to the _first_ load as well as the second.
This input is dead, but results in unnecessary loads from the stack.
E.g. for:

#include <arm_neon.h>

void
foo (uint32_t *a, uint32_t *b, uint32_t *c)
{
  uint32x4x3_t x, y;

  x = vld3q_u32 (a);
  y = vld3q_u32 (b);
  x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
  x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
  x.val[2] = vaddq_u32 (x.val[2], y.val[2]);
  vst3q_u32 (a, x);
}

we get:

        stmfd   sp!, {r3, fp}
        ldr     r2, .L2
        add     fp, sp, #4
        vldmia  r2, {d16-d21}
        sub     sp, sp, #112
        vmov    q11, q8  @ ti
        vmov    q12, q9  @ ti
        vmov    q13, q10  @ ti
        ...

where the vldmia is loading the x and y "inputs" to the two vld3q_u32s
from the corresponding stack slots.

It's true that vld?a doesn't _change_ the whole of the array,
but that doesn't matter; we no longer care what values the
other elements have.

Tested on arm-linux-gnueabi.  OK to install?

Richard


gcc/
        * config/arm/neon.md (neon_vld3qa<mode>, neon_vld4qa<mode>): Remove
        operand 1 and reshuffle the operands to match.
        (neon_vld3<mode>, neon_vld4<mode>): Update accordingly.

Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md      2011-03-24 13:47:13.000000000 +0000
+++ gcc/config/arm/neon.md      2011-03-24 15:51:59.000000000 +0000
@@ -4605,8 +4605,7 @@ (define_expand "neon_vld3<mode>"
    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
-  emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
-                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[1], operands[1]));
   emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
                                     operands[1], operands[1]));
   DONE;
@@ -4614,12 +4613,11 @@ (define_expand "neon_vld3<mode>"
 
 (define_insn "neon_vld3qa<mode>"
   [(set (match_operand:CI 0 "s_register_operand" "=w")
-        (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
-                    (match_operand:CI 1 "s_register_operand" "0")
+        (unspec:CI [(mem:CI (match_operand:SI 2 "s_register_operand" "1"))
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD3A))
-   (set (match_operand:SI 2 "s_register_operand" "=r")
-        (plus:SI (match_dup 3)
+   (set (match_operand:SI 1 "s_register_operand" "=r")
+        (plus:SI (match_dup 2)
                 (const_int 24)))]
   "TARGET_NEON"
 {
@@ -4628,7 +4626,7 @@ (define_insn "neon_vld3qa<mode>"
   ops[0] = gen_rtx_REG (DImode, regno);
   ops[1] = gen_rtx_REG (DImode, regno + 4);
   ops[2] = gen_rtx_REG (DImode, regno + 8);
-  ops[3] = operands[2];
+  ops[3] = operands[1];
   output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
   return "";
 }
@@ -4897,8 +4895,7 @@ (define_expand "neon_vld4<mode>"
    (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
-  emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
-                                    operands[1], operands[1]));
+  emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[1], operands[1]));
   emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
                                     operands[1], operands[1]));
   DONE;
@@ -4906,12 +4903,11 @@ (define_expand "neon_vld4<mode>"
 
 (define_insn "neon_vld4qa<mode>"
   [(set (match_operand:XI 0 "s_register_operand" "=w")
-        (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
-                    (match_operand:XI 1 "s_register_operand" "0")
+        (unspec:XI [(mem:XI (match_operand:SI 2 "s_register_operand" "1"))
                     (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VLD4A))
-   (set (match_operand:SI 2 "s_register_operand" "=r")
-        (plus:SI (match_dup 3)
+   (set (match_operand:SI 1 "s_register_operand" "=r")
+        (plus:SI (match_dup 2)
                 (const_int 32)))]
   "TARGET_NEON"
 {
@@ -4921,7 +4917,7 @@ (define_insn "neon_vld4qa<mode>"
   ops[1] = gen_rtx_REG (DImode, regno + 4);
   ops[2] = gen_rtx_REG (DImode, regno + 8);
   ops[3] = gen_rtx_REG (DImode, regno + 12);
-  ops[4] = operands[2];
+  ops[4] = operands[1];
   output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
   return "";
 }

Tweak ARM vld3q and vld4q patterns

Reply via email to