This first patch optimises the output for vld3q and vld4q functions.
Each of these functions expands into two individual vld3 or vld4 instructions,
with each instruction setting one (interleaved) half of the output
register. The problem was that both instructions treated the
output register as an input, whereas only the second one needs to.
GCC therefore treated the output register as being live before the
vldNq and generated unnecessary spill code.
E.g.:
#include <arm_neon.h>
/* Reproducer for the unnecessary spill code: load two uint32x4x3_t
   values with vld3q_u32 from A and B, add them element-wise with
   vaddq_u32, and store the sums back through A with vst3q_u32.
   C is not used.  */
void
foo (uint32_t *a, uint32_t *b, uint32_t *c)
{
uint32x4x3_t x, y;
x = vld3q_u32 (a);
y = vld3q_u32 (b);
x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
x.val[2] = vaddq_u32 (x.val[2], y.val[2]);
vst3q_u32 (a, x);
}
gave:
stmfd sp!, {r3, fp}
ldr r2, .L2
add fp, sp, #4
vldmia r2, {d16-d21}
sub sp, sp, #112
vmov q11, q8 @ ti
vmov q12, q9 @ ti
vmov q13, q10 @ ti
...
where the vldmia is loading the x and y "inputs" to the two vld3q_u32s
from the corresponding stack slots.
The patch is a backport of:
http://gcc.gnu.org/ml/gcc-patches/2011-03/msg01634.html
which has been applied to 4.7. No changes were needed for 4.5.
Richard
gcc/
Backport from mainline:
2011-03-30 Richard Sandiford <[email protected]>
Ramana Radhakrishnan <[email protected]>
PR target/43590
* config/arm/neon.md (neon_vld3qa<mode>, neon_vld4qa<mode>): Remove
operand 1 and reshuffle the operands to match.
(neon_vld3<mode>, neon_vld4<mode>): Update accordingly.
Index: gcc/config/arm/neon.md
===================================================================
--- gcc/config/arm/neon.md 2011-04-19 13:55:04.000000000 +0000
+++ gcc/config/arm/neon.md 2011-04-19 13:55:04.000000000 +0000
@@ -4925,8 +4925,7 @@ (define_expand "neon_vld3<mode>"
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
- emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
- operands[1], operands[1]));
+ emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[1], operands[1]));
emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
operands[1], operands[1]));
DONE;
@@ -4934,12 +4933,11 @@ (define_expand "neon_vld3<mode>"
(define_insn "neon_vld3qa<mode>"
[(set (match_operand:CI 0 "s_register_operand" "=w")
- (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
- (match_operand:CI 1 "s_register_operand" "0")
+ (unspec:CI [(mem:CI (match_operand:SI 2 "s_register_operand" "1"))
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD3A))
- (set (match_operand:SI 2 "s_register_operand" "=r")
- (plus:SI (match_dup 3)
+ (set (match_operand:SI 1 "s_register_operand" "=r")
+ (plus:SI (match_dup 2)
(const_int 24)))]
"TARGET_NEON"
{
@@ -4948,7 +4946,7 @@ (define_insn "neon_vld3qa<mode>"
ops[0] = gen_rtx_REG (DImode, regno);
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
- ops[3] = operands[2];
+ ops[3] = operands[1];
output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
return "";
}
@@ -5217,8 +5215,7 @@ (define_expand "neon_vld4<mode>"
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
"TARGET_NEON"
{
- emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
- operands[1], operands[1]));
+ emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[1], operands[1]));
emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
operands[1], operands[1]));
DONE;
@@ -5226,12 +5223,11 @@ (define_expand "neon_vld4<mode>"
(define_insn "neon_vld4qa<mode>"
[(set (match_operand:XI 0 "s_register_operand" "=w")
- (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
- (match_operand:XI 1 "s_register_operand" "0")
+ (unspec:XI [(mem:XI (match_operand:SI 2 "s_register_operand" "1"))
(unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
UNSPEC_VLD4A))
- (set (match_operand:SI 2 "s_register_operand" "=r")
- (plus:SI (match_dup 3)
+ (set (match_operand:SI 1 "s_register_operand" "=r")
+ (plus:SI (match_dup 2)
(const_int 32)))]
"TARGET_NEON"
{
@@ -5241,7 +5237,7 @@ (define_insn "neon_vld4qa<mode>"
ops[1] = gen_rtx_REG (DImode, regno + 4);
ops[2] = gen_rtx_REG (DImode, regno + 8);
ops[3] = gen_rtx_REG (DImode, regno + 12);
- ops[4] = operands[2];
+ ops[4] = operands[1];
output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
return "";
}
_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain