The ARM vld3q and vld4q .md patterns expand into two individual vld3/vld4 instructions. Each instruction loads half of the total elements. The problem is that this is implemented as:

    array = vld3a (array, mem1)
    array = vld3b (array, mem2)

with "array" being an input to the _first_ load as well as the second. This input is dead, but it still results in unnecessary loads from the stack. E.g. for:

    #include <arm_neon.h>

    void
    foo (uint32_t *a, uint32_t *b, uint32_t *c)
    {
      uint32x4x3_t x, y;

      x = vld3q_u32 (a);
      y = vld3q_u32 (b);
      x.val[0] = vaddq_u32 (x.val[0], y.val[0]);
      x.val[1] = vaddq_u32 (x.val[1], y.val[1]);
      x.val[2] = vaddq_u32 (x.val[2], y.val[2]);
      vst3q_u32 (a, x);
    }

we get:

    stmfd   sp!, {r3, fp}
    ldr     r2, .L2
    add     fp, sp, #4
    vldmia  r2, {d16-d21}
    sub     sp, sp, #112
    vmov    q11, q8  @ ti
    vmov    q12, q9  @ ti
    vmov    q13, q10  @ ti
    ...

where the vldmia is loading the x and y "inputs" to the two vld3q_u32s from the corresponding stack slots.

It's true that vld?a doesn't _change_ the whole of the array, but that doesn't matter; we no longer care what values the other elements have.

Tested on arm-linux-gnueabi. OK to install?

Richard


gcc/
	* config/arm/neon.md (neon_vld3qa<mode>, neon_vld4qa<mode>): Remove
	operand 1 and reshuffle the operands to match.
	(neon_vld3<mode>, neon_vld4<mode>): Update accordingly.

Index: gcc/config/arm/neon.md =================================================================== --- gcc/config/arm/neon.md 2011-03-24 13:47:13.000000000 +0000 +++ gcc/config/arm/neon.md 2011-03-24 15:51:59.000000000 +0000 @@ -4605,8 +4605,7 @@ (define_expand "neon_vld3<mode>" (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { - emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0], - operands[1], operands[1])); + emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[1], operands[1])); emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0], operands[1], operands[1])); DONE; @@ -4614,12 +4613,11 @@ (define_expand "neon_vld3<mode>" (define_insn "neon_vld3qa<mode>" [(set (match_operand:CI 0 "s_register_operand" "=w") - (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2")) - (match_operand:CI 1 "s_register_operand" "0") + (unspec:CI [(mem:CI (match_operand:SI 2 "s_register_operand" "1")) (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A)) - (set (match_operand:SI 2 "s_register_operand" "=r") - (plus:SI (match_dup 3) + (set (match_operand:SI 1 "s_register_operand" "=r") + (plus:SI (match_dup 2) (const_int 24)))] "TARGET_NEON" { @@ -4628,7 +4626,7 @@ (define_insn "neon_vld3qa<mode>" ops[0] = gen_rtx_REG (DImode, regno); ops[1] = gen_rtx_REG (DImode, regno + 4); ops[2] = gen_rtx_REG (DImode, regno + 8); - ops[3] = operands[2]; + ops[3] = operands[1]; output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops); return ""; } @@ -4897,8 +4895,7 @@ (define_expand "neon_vld4<mode>" (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { - emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0], - operands[1], operands[1])); + emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[1], operands[1])); emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0], operands[1], operands[1])); DONE; @@ -4906,12 +4903,11 @@ (define_expand "neon_vld4<mode>" (define_insn "neon_vld4qa<mode>" [(set (match_operand:XI 0 
"s_register_operand" "=w") - (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2")) - (match_operand:XI 1 "s_register_operand" "0") + (unspec:XI [(mem:XI (match_operand:SI 2 "s_register_operand" "1")) (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A)) - (set (match_operand:SI 2 "s_register_operand" "=r") - (plus:SI (match_dup 3) + (set (match_operand:SI 1 "s_register_operand" "=r") + (plus:SI (match_dup 2) (const_int 32)))] "TARGET_NEON" { @@ -4921,7 +4917,7 @@ (define_insn "neon_vld4qa<mode>" ops[1] = gen_rtx_REG (DImode, regno + 4); ops[2] = gen_rtx_REG (DImode, regno + 8); ops[3] = gen_rtx_REG (DImode, regno + 12); - ops[4] = operands[2]; + ops[4] = operands[1]; output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops); return ""; }