Hello, a while ago Andrew Stubbs posted a patch to use NEON registers and instructions to perform 64-bit integer shifts: http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01645.html
As Andrew no longer works on ARM, I've now picked this up and reworked it a bit: - Updated for current mainline changes. - Fixed a typo in the "left shift by 1" special case. - Reworked constraint lists to have the NEON alternatives actually reliably chosen in the "left shift by register" case. - Noticed that arm_emit_coreregs_64bit_shift actually does *not* need a scratch for shifting by constant in any case, which simplifies the implementation a bit. - Further minor simplifications & cleanup. Tested on arm-linux-gnueabi (--with-arch=armv7-a --with-float=softfp --with-fpu=neon --with-mode=thumb) with no regressions. OK for mainline? Bye, Ulrich ChangeLog: 2012-09-17 Andrew Stubbs <a...@codesourcery.com> Ulrich Weigand <ulrich.weig...@linaro.org> * config/arm/arm.c (arm_print_operand): Add new 'E' format code. (arm_emit_coreregs_64bit_shift): Fix comment. * config/arm/arm.h (enum reg_class): Add VFP_LO_REGS_EVEN. (REG_CLASS_NAMES, REG_CLASS_CONTENTS, IS_VFP_CLASS): Likewise. * config/arm/arm.md (opt, opt_enabled): New attributes. (enabled): Use opt_enabled. (ashldi3, ashrdi3, lshrdi3): Add TARGET_NEON case. * config/arm/constraints.md (T): New register constraint. * config/arm/iterators.md (rshifts): New code iterator. (shift, shifttype): New code attributes. * config/arm/neon.md (signed_shift_di3_neon, unsigned_shift_di3_neon, ashldi3_neon_noclobber, ashldi3_neon, ashrdi3_neon_imm_noclobber, lshrdi3_neon_imm_noclobber, <shift>di3_neon): New patterns. Index: gcc/config/arm/arm.c =================================================================== *** gcc/config/arm/arm.c (revision 191400) --- gcc/config/arm/arm.c (working copy) *************** arm_print_operand (FILE *stream, rtx x, *** 17280,17285 **** --- 17280,17303 ---- } return; + /* Print the VFP/Neon double precision register name that overlaps the + given single-precision register. */ + case 'E': + { + int mode = GET_MODE (x); + + if (GET_MODE_SIZE (mode) != 4 + || GET_CODE (x) != REG + || !IS_VFP_REGNUM (REGNO (x))) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fprintf (stream, "d%d", (REGNO (x) - FIRST_VFP_REGNUM) >> 1); + } + return; + /* These two codes print the low/high doubleword register of a Neon quad register, respectively. For pair-structure types, can also print low/high quadword registers. */ *************** arm_autoinc_modes_ok_p (enum machine_mod *** 26293,26300 **** Input requirements: - It is safe for the input and output to be the same register, but early-clobber rules apply for the shift amount and scratch registers. ! - Shift by register requires both scratch registers. Shift by a constant ! less than 32 in Thumb2 mode requires SCRATCH1 only. In all other cases the scratch registers may be NULL. - Ashiftrt by a register also clobbers the CC register. */ void --- 26311,26317 ---- Input requirements: - It is safe for the input and output to be the same register, but early-clobber rules apply for the shift amount and scratch registers. ! - Shift by register requires both scratch registers. In all other cases the scratch registers may be NULL. - Ashiftrt by a register also clobbers the CC register. */ void Index: gcc/config/arm/arm.h =================================================================== *** gcc/config/arm/arm.h (revision 191254) --- gcc/config/arm/arm.h (working copy) *************** enum reg_class *** 1120,1125 **** --- 1120,1126 ---- CORE_REGS, VFP_D0_D7_REGS, VFP_LO_REGS, + VFP_LO_REGS_EVEN, VFP_HI_REGS, VFP_REGS, IWMMXT_REGS, *************** enum reg_class *** 1146,1151 **** --- 1147,1153 ---- "CORE_REGS", \ "VFP_D0_D7_REGS", \ "VFP_LO_REGS", \ + "VFP_LO_REGS_EVEN", \ "VFP_HI_REGS", \ "VFP_REGS", \ "IWMMXT_REGS", \ *************** enum reg_class *** 1169,1174 **** --- 1171,1177 ---- { 0x00007FFF, 0x00000000, 0x00000000, 0x00000000 }, /* CORE_REGS */ \ { 0xFFFF0000, 0x00000000, 0x00000000, 0x00000000 }, /* VFP_D0_D7_REGS */ \ { 0xFFFF0000, 0x0000FFFF, 0x00000000, 0x00000000 }, /* VFP_LO_REGS */ \ + { 0x55550000, 0x00005555, 0x00000000, 0x00000000 }, /* VFP_LO_REGS_EVEN */ \ { 0x00000000, 0xFFFF0000, 0x0000FFFF, 0x00000000 }, /* VFP_HI_REGS */ \ { 0xFFFF0000, 0xFFFFFFFF, 0x0000FFFF, 0x00000000 }, /* VFP_REGS */ \ { 0x00000000, 0x00000000, 0xFFFF0000, 0x00000000 }, /* IWMMXT_REGS */ \ *************** enum reg_class *** 1182,1188 **** /* Any of the VFP register classes. */ #define IS_VFP_CLASS(X) \ ! ((X) == VFP_D0_D7_REGS || (X) == VFP_LO_REGS \ || (X) == VFP_HI_REGS || (X) == VFP_REGS) /* The same information, inverted: --- 1185,1191 ---- /* Any of the VFP register classes. */ #define IS_VFP_CLASS(X) \ ! ((X) == VFP_D0_D7_REGS || (X) == VFP_LO_REGS || (X) == VFP_LO_REGS_EVEN \ || (X) == VFP_HI_REGS || (X) == VFP_REGS) /* The same information, inverted: Index: gcc/config/arm/neon.md =================================================================== *** gcc/config/arm/neon.md (revision 191400) --- gcc/config/arm/neon.md (working copy) *************** *** 1170,1175 **** --- 1170,1335 ---- DONE; }) + ;; 64-bit shifts + + (define_insn "ashldi3_neon_noclobber" + [(set (match_operand:DI 0 "s_register_operand" "=w,w") + (ashift:DI (match_operand:DI 1 "s_register_operand" " w,w") + (match_operand:SI 2 "reg_or_int_operand" " i,T")))] + "TARGET_NEON && reload_completed + && (!CONST_INT_P (operands[2]) + || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))" + "@ + vshl.u64\t%P0, %P1, %2 + vshl.u64\t%P0, %P1, %E2 @ ashl %P0, %P1, %2" + [(set_attr "neon_type" "neon_vshl_ddd,neon_vshl_ddd")] + ) + + (define_insn_and_split "ashldi3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?r,?w") + (ashift:DI (match_operand:DI 1 "s_register_operand" " w, 0r, r, w") + (match_operand:SI 2 "reg_or_int_operand" "Ti, r, i,Ti"))) + (clobber (match_scratch:SI 3 "=X,?&r, X, X")) + (clobber (match_scratch:SI 4 "=X,?&r, X, X")) + (clobber (reg:CC_C CC_REGNUM))] + "TARGET_NEON" + "#" + "TARGET_NEON && reload_completed" + [(const_int 0)] + " + { + if (IS_VFP_REGNUM (REGNO (operands[0]))) + { + if (CONST_INT_P (operands[2])) + { + if (INTVAL (operands[2]) < 1) + { + emit_insn (gen_movdi (operands[0], operands[1])); + DONE; + } + else if (INTVAL (operands[2]) > 63) + operands[2] = gen_rtx_CONST_INT (VOIDmode, 63); + } + + /* Ditch the unnecessary clobbers. */ + emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1], + operands[2])); + } + else + { + if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1) + /* This clobbers CC. */ + emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1])); + else + arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1], + operands[2], operands[3], operands[4]); + } + DONE; + }" + [(set_attr "arch" "nota8,*,*,onlya8") + (set_attr "opt" "*,speed,speed,*")] + ) + + ; The shift amount needs to be negated for right-shifts + (define_insn "signed_shift_di3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (unspec:DI [(match_operand:DI 1 "s_register_operand" " w") + (match_operand:SI 2 "s_register_operand" " T")] + UNSPEC_ASHIFT_SIGNED))] + "TARGET_NEON && reload_completed" + "vshl.s64\t%P0, %P1, %E2 @ ashr %P0, %P1, %2" + [(set_attr "neon_type" "neon_vshl_ddd")] + ) + + ; The shift amount needs to be negated for right-shifts + (define_insn "unsigned_shift_di3_neon" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (unspec:DI [(match_operand:DI 1 "s_register_operand" " w") + (match_operand:SI 2 "s_register_operand" " T")] + UNSPEC_ASHIFT_UNSIGNED))] + "TARGET_NEON && reload_completed" + "vshl.u64\t%P0, %P1, %E2 @ lshr %P0, %P1, %2" + [(set_attr "neon_type" "neon_vshl_ddd")] + ) + + (define_insn "ashrdi3_neon_imm_noclobber" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w") + (match_operand:SI 2 "const_int_operand" " i")))] + "TARGET_NEON && reload_completed + && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64" + "vshr.s64\t%P0, %P1, %2" + [(set_attr "neon_type" "neon_vshl_ddd")] + ) + + (define_insn "lshrdi3_neon_imm_noclobber" + [(set (match_operand:DI 0 "s_register_operand" "=w") + (lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w") + (match_operand:SI 2 "const_int_operand" " i")))] + "TARGET_NEON && reload_completed + && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64" + "vshr.u64\t%P0, %P1, %2" + [(set_attr "neon_type" "neon_vshl_ddd")] + ) + + ;; ashrdi3_neon + ;; lshrdi3_neon + (define_insn_and_split "<shift>di3_neon" + [(set (match_operand:DI 0 "s_register_operand" "= w, w,?&r,?r,?w,?w") + (rshifts:DI (match_operand:DI 1 "s_register_operand" " w, w, 0r, r, w, w") + (match_operand:SI 2 "reg_or_int_operand" " r, i, r, i, r, i"))) + (clobber (match_scratch:SI 3 "= r, X, &r, X, r, X")) + (clobber (match_scratch:SI 4 "=&T, X, &r, X,&T, X")) + (clobber (reg:CC CC_REGNUM))] + "TARGET_NEON" + "#" + "TARGET_NEON && reload_completed" + [(const_int 0)] + " + { + if (IS_VFP_REGNUM (REGNO (operands[0]))) + { + if (CONST_INT_P (operands[2])) + { + if (INTVAL (operands[2]) < 1) + { + emit_insn (gen_movdi (operands[0], operands[1])); + DONE; + } + else if (INTVAL (operands[2]) > 64) + operands[2] = gen_rtx_CONST_INT (VOIDmode, 64); + + /* Ditch the unnecessary clobbers. */ + emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0], + operands[1], + operands[2])); + } + else + { + /* We must use a negative left-shift. */ + emit_insn (gen_negsi2 (operands[3], operands[2])); + emit_insn (gen_rtx_SET (SImode, operands[4], operands[3])); + emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], operands[1], + operands[4])); + } + } + else + { + if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1) + /* This clobbers CC. */ + emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1])); + else + /* This clobbers CC (ASHIFTRT by register only). */ + arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1], + operands[2], operands[3], operands[4]); + } + + DONE; + }" + [(set_attr "arch" "nota8,nota8,*,*,onlya8,onlya8") + (set_attr "opt" "*,*,speed,speed,*,*")] + ) + ;; Widening operations (define_insn "widen_ssum<mode>3" Index: gcc/config/arm/constraints.md =================================================================== *** gcc/config/arm/constraints.md (revision 191254) --- gcc/config/arm/constraints.md (working copy) *************** *** 19,25 **** ;; <http://www.gnu.org/licenses/>. ;; The following register constraints have been used: ! ;; - in ARM/Thumb-2 state: t, w, x, y, z ;; - in Thumb state: h, b ;; - in both states: l, c, k ;; In ARM state, 'l' is an alias for 'r' --- 19,25 ---- ;; <http://www.gnu.org/licenses/>. ;; The following register constraints have been used: ! ;; - in ARM/Thumb-2 state: t, T, w, x, y, z ;; - in Thumb state: h, b ;; - in both states: l, c, k ;; In ARM state, 'l' is an alias for 'r' *************** *** 44,49 **** --- 44,52 ---- (define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS" "The VFP registers @code{s0}-@code{s31}.") + (define_register_constraint "T" "TARGET_32BIT ? VFP_LO_REGS_EVEN : NO_REGS" + "The even numbered VFP registers @code{s0}-@code{s31}.") + (define_register_constraint "w" "TARGET_32BIT ? (TARGET_VFPD32 ? VFP_REGS : VFP_LO_REGS) : NO_REGS" "The VFP registers @code{d0}-@code{d15}, or @code{d0}-@code{d31} for VFPv3.") Index: gcc/config/arm/iterators.md =================================================================== *** gcc/config/arm/iterators.md (revision 191254) --- gcc/config/arm/iterators.md (working copy) *************** *** 188,193 **** --- 188,196 ---- ;; A list of widening operators (define_code_iterator SE [sign_extend zero_extend]) + ;; Right shifts + (define_code_iterator rshifts [ashiftrt lshiftrt]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- *************** *** 449,451 **** --- 452,459 ---- ;; Assembler mnemonics for signedness of widening operations. (define_code_attr US [(sign_extend "s") (zero_extend "u")]) + + ;; Right shifts + (define_code_attr shift [(ashiftrt "ashr") (lshiftrt "lshr")]) + (define_code_attr shifttype [(ashiftrt "signed") (lshiftrt "unsigned")]) + Index: gcc/config/arm/arm.md =================================================================== *** gcc/config/arm/arm.md (revision 191254) --- gcc/config/arm/arm.md (working copy) *************** *** 249,254 **** --- 249,270 ---- (const_string "no"))) + (define_attr "opt" "any,speed,size" + (const_string "any")) + + (define_attr "opt_enabled" "no,yes" + (cond [(eq_attr "opt" "any") + (const_string "yes") + + (and (eq_attr "opt" "speed") + (match_test "optimize_function_for_speed_p (cfun)")) + (const_string "yes") + + (and (eq_attr "opt" "size") + (match_test "optimize_function_for_size_p (cfun)")) + (const_string "yes")] + (const_string "no"))) + ; Allows an insn to disable certain alternatives for reasons other than ; arch support. (define_attr "insn_enabled" "no,yes" *************** *** 256,266 **** ; Enable all alternatives that are both arch_enabled and insn_enabled. (define_attr "enabled" "no,yes" ! (if_then_else (eq_attr "insn_enabled" "yes") ! (if_then_else (eq_attr "arch_enabled" "yes") ! (const_string "yes") ! (const_string "no")) ! (const_string "no"))) ; POOL_RANGE is how far away from a constant pool entry that this insn ; can be placed. If the distance is zero, then this insn will never --- 272,286 ---- ; Enable all alternatives that are both arch_enabled and insn_enabled. (define_attr "enabled" "no,yes" ! (cond [(eq_attr "insn_enabled" "no") ! (const_string "no") ! ! (eq_attr "arch_enabled" "no") ! (const_string "no") ! ! (eq_attr "opt_enabled" "no") ! (const_string "no")] ! (const_string "yes"))) ; POOL_RANGE is how far away from a constant pool entry that this insn ; can be placed. If the distance is zero, then this insn will never *************** *** 3492,3498 **** (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { --- 3512,3525 ---- (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (TARGET_NEON) ! { ! /* Delay the decision whether to use NEON or core-regs until ! register allocation. */ ! emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2])); ! DONE; ! } ! else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { *************** *** 3566,3572 **** (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { --- 3593,3606 ---- (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (TARGET_NEON) ! { ! /* Delay the decision whether to use NEON or core-regs until ! register allocation. */ ! emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2])); ! DONE; ! } ! else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { *************** *** 3638,3644 **** (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { --- 3672,3685 ---- (match_operand:SI 2 "reg_or_int_operand" "")))] "TARGET_32BIT" " ! if (TARGET_NEON) ! { ! /* Delay the decision whether to use NEON or core-regs until ! register allocation. */ ! emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2])); ! DONE; ! } ! else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT) ; /* No special preparation statements; expand pattern as above. */ else { -- Dr. Ulrich Weigand GNU Toolchain for Linux on System z and Cell BE ulrich.weig...@de.ibm.com