Hello, following up on the prior patch, this patch exploits more opportunities to generate the vld1 / vst1 family of instructions, this time to implement the vec_set and vec_extract patterns with memory scalar operands.
Without the patch, vec_set/vec_extract only support register operands for the scalar, possibly requiring extra moves. In some cases we'd still get a vst1 instruction as a result, since combine would match a neon_vst1_lane pattern. However, that pattern seems to be actually incorrect for big-endian systems (due to a line number ordering mismatch). The patch therefore also changes neon_vst1_lane to use an UNSPEC instead of vec_select to model its operation, just like all the other NEON intrinsic patterns depending on line numbers already do. Benchmarking showed only marginal improvements, but no regression. It seems useful to support this anyway ... Tested on arm-linux-gnueabi. OK for mainline? Bye, Ulrich 2012-09-14 Ulrich Weigand <ulrich.weig...@linaro.org> * config/arm/arm.c (arm_rtx_costs_1): Handle vec_extract and vec_set patterns. * config/arm/arm.md ("vec_set<mode>_internal"): Support memory source operands, implemented via vld1 instruction. ("vec_extract<mode>"): Support memory destination operands, implemented via vst1 instruction. ("neon_vst1_lane<mode>"): Use UNSPEC_VST1_LANE instead of vec_select. * config/arm/predicates.md ("neon_lane_number"): Remove. Index: gcc-head/gcc/config/arm/arm.c =================================================================== --- gcc-head.orig/gcc/config/arm/arm.c 2012-09-14 19:40:51.000000000 +0200 +++ gcc-head/gcc/config/arm/arm.c 2012-09-14 19:41:14.000000000 +0200 @@ -7666,6 +7666,28 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou return true; case SET: + /* The vec_extract patterns accept memory operands that require an + address reload. Account for the cost of that reload to give the + auto-inc-dec pass an incentive to try to replace them. */ + if (TARGET_NEON && MEM_P (SET_DEST (x)) + && GET_CODE (SET_SRC (x)) == VEC_SELECT) + { + *total = rtx_cost (SET_DEST (x), code, 0, speed); + if (!neon_vector_mem_operand (SET_DEST (x), 2)) + *total += COSTS_N_INSNS (1); + return true; + } + /* Likewise for the vec_set patterns. */ + if (TARGET_NEON && GET_CODE (SET_SRC (x)) == VEC_MERGE + && GET_CODE (XEXP (SET_SRC (x), 0)) == VEC_DUPLICATE + && MEM_P (XEXP (XEXP (SET_SRC (x), 0), 0))) + { + rtx mem = XEXP (XEXP (SET_SRC (x), 0), 0); + *total = rtx_cost (mem, code, 0, speed); + if (!neon_vector_mem_operand (mem, 2)) + *total += COSTS_N_INSNS (1); + return true; + } return false; case UNSPEC: Index: gcc-head/gcc/config/arm/neon.md =================================================================== --- gcc-head.orig/gcc/config/arm/neon.md 2012-09-14 19:40:51.000000000 +0200 +++ gcc-head/gcc/config/arm/neon.md 2012-09-14 19:41:14.000000000 +0200 @@ -416,30 +416,33 @@ [(set_attr "neon_type" "neon_vld1_1_2_regs")]) (define_insn "vec_set<mode>_internal" - [(set (match_operand:VD 0 "s_register_operand" "=w") + [(set (match_operand:VD 0 "s_register_operand" "=w,w") (vec_merge:VD (vec_duplicate:VD - (match_operand:<V_elem> 1 "s_register_operand" "r")) - (match_operand:VD 3 "s_register_operand" "0") - (match_operand:SI 2 "immediate_operand" "i")))] + (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) + (match_operand:VD 3 "s_register_operand" "0,0") + (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { int elt = ffs ((int) INTVAL (operands[2])) - 1; if (BYTES_BIG_ENDIAN) elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt; operands[2] = GEN_INT (elt); - - return "vmov.<V_sz_elem>\t%P0[%c2], %1"; + + if (which_alternative == 0) + return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1"; + else + return "vmov.<V_sz_elem>\t%P0[%c2], %1"; } - [(set_attr "neon_type" "neon_mcr")]) + [(set_attr "neon_type" "neon_vld1_vld2_lane,neon_mcr")]) (define_insn "vec_set<mode>_internal" - [(set (match_operand:VQ 0 "s_register_operand" "=w") + [(set (match_operand:VQ 0 "s_register_operand" "=w,w") (vec_merge:VQ (vec_duplicate:VQ - (match_operand:<V_elem> 1 "s_register_operand" "r")) - (match_operand:VQ 3 "s_register_operand" "0") - (match_operand:SI 2 "immediate_operand" "i")))] + (match_operand:<V_elem> 1 "nonimmediate_operand" "Um,r")) + (match_operand:VQ 3 "s_register_operand" "0,0") + (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; @@ -454,18 +457,21 @@ operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi); operands[2] = GEN_INT (elt); - return "vmov.<V_sz_elem>\t%P0[%c2], %1"; + if (which_alternative == 0) + return "vld1.<V_sz_elem>\t{%P0[%c2]}, %A1"; + else + return "vmov.<V_sz_elem>\t%P0[%c2], %1"; } - [(set_attr "neon_type" "neon_mcr")] + [(set_attr "neon_type" "neon_vld1_vld2_lane,neon_mcr")] ) (define_insn "vec_setv2di_internal" - [(set (match_operand:V2DI 0 "s_register_operand" "=w") + [(set (match_operand:V2DI 0 "s_register_operand" "=w,w") (vec_merge:V2DI (vec_duplicate:V2DI - (match_operand:DI 1 "s_register_operand" "r")) - (match_operand:V2DI 3 "s_register_operand" "0") - (match_operand:SI 2 "immediate_operand" "i")))] + (match_operand:DI 1 "nonimmediate_operand" "Um,r")) + (match_operand:V2DI 3 "s_register_operand" "0,0") + (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { HOST_WIDE_INT elem = ffs ((int) INTVAL (operands[2])) - 1; @@ -473,9 +479,12 @@ operands[0] = gen_rtx_REG (DImode, regno); - return "vmov\t%P0, %Q1, %R1"; + if (which_alternative == 0) + return "vld1.64\t%P0, %A1"; + else + return "vmov\t%P0, %Q1, %R1"; } - [(set_attr "neon_type" "neon_mcr_2_mcrr")] + [(set_attr "neon_type" "neon_vld1_1_2_regs,neon_mcr_2_mcrr")] ) (define_expand "vec_set<mode>" @@ -491,10 +500,10 @@ }) (define_insn "vec_extract<mode>" - [(set (match_operand:<V_elem> 0 "s_register_operand" "=r") + [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") (vec_select:<V_elem> - (match_operand:VD 1 "s_register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + (match_operand:VD 1 "s_register_operand" "w,w") + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { if (BYTES_BIG_ENDIAN) @@ -503,16 +512,20 @@ elt = GET_MODE_NUNITS (<MODE>mode) - 1 - elt; operands[2] = GEN_INT (elt); } - return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; + + if (which_alternative == 0) + return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0"; + else + return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; } - [(set_attr "neon_type" "neon_bp_simple")] + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_bp_simple")] ) (define_insn "vec_extract<mode>" - [(set (match_operand:<V_elem> 0 "s_register_operand" "=r") + [(set (match_operand:<V_elem> 0 "nonimmediate_operand" "=Um,r") (vec_select:<V_elem> - (match_operand:VQ 1 "s_register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + (match_operand:VQ 1 "s_register_operand" "w,w") + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2; @@ -526,25 +539,31 @@ operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi); operands[2] = GEN_INT (elt); - return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; + if (which_alternative == 0) + return "vst1.<V_sz_elem>\t{%P1[%c2]}, %A0"; + else + return "vmov.<V_uf_sclr>\t%0, %P1[%c2]"; } - [(set_attr "neon_type" "neon_bp_simple")] + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_bp_simple")] ) (define_insn "vec_extractv2di" - [(set (match_operand:DI 0 "s_register_operand" "=r") + [(set (match_operand:DI 0 "nonimmediate_operand" "=Um,r") (vec_select:DI - (match_operand:V2DI 1 "s_register_operand" "w") - (parallel [(match_operand:SI 2 "immediate_operand" "i")])))] + (match_operand:V2DI 1 "s_register_operand" "w,w") + (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { int regno = REGNO (operands[1]) + 2 * INTVAL (operands[2]); operands[1] = gen_rtx_REG (DImode, regno); - return "vmov\t%Q0, %R0, %P1 @ v2di"; + if (which_alternative == 0) + return "vst1.64\t{%P1}, %A0 @ v2di"; + else + return "vmov\t%Q0, %R0, %P1 @ v2di"; } - [(set_attr "neon_type" "neon_int_1")] + [(set_attr "neon_type" "neon_vst1_vst2_lane,neon_int_1")] ) (define_expand "vec_init<mode>" @@ -4449,9 +4468,10 @@ (define_insn "neon_vst1_lane<mode>" [(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um") - (vec_select:<V_elem> - (match_operand:VDX 1 "s_register_operand" "w") - (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] + (unspec:<V_elem> + [(match_operand:VDX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VST1_LANE))] "TARGET_NEON" { HOST_WIDE_INT lane = INTVAL (operands[2]); @@ -4470,9 +4490,10 @@ (define_insn "neon_vst1_lane<mode>" [(set (match_operand:<V_elem> 0 "neon_struct_operand" "=Um") - (vec_select:<V_elem> - (match_operand:VQX 1 "s_register_operand" "w") - (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))] + (unspec:<V_elem> + [(match_operand:VQX 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + UNSPEC_VST1_LANE))] "TARGET_NEON" { HOST_WIDE_INT lane = INTVAL (operands[2]); Index: gcc-head/gcc/config/arm/predicates.md =================================================================== --- gcc-head.orig/gcc/config/arm/predicates.md 2012-09-14 19:38:14.000000000 +0200 +++ gcc-head/gcc/config/arm/predicates.md 2012-09-14 19:41:14.000000000 +0200 @@ -524,10 +524,6 @@ (ior (match_operand 0 "imm_for_neon_inv_logic_operand") (match_operand 0 "s_register_operand"))) -;; TODO: We could check lane numbers more precisely based on the mode. -(define_predicate "neon_lane_number" - (and (match_code "const_int") - (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 15"))) ;; Predicates for named expanders that overlap multiple ISAs. (define_predicate "cmpdi_operand" -- Dr. Ulrich Weigand GNU Toolchain for Linux on System z and Cell BE ulrich.weig...@de.ibm.com