On 18/09/12 18:06, Ulrich Weigand wrote:
> Hello,
> 
> a while ago Andrew Stubbs posted a patch to use NEON registers
> and instructions to perform 64-bit integer shifts:
> http://gcc.gnu.org/ml/gcc-patches/2012-05/msg01645.html
> 
> As Andrew no longer works on ARM, I've now picked this up and
> reworked it a bit:
> 
> - Updated for current mainline changes.
> - Fixed a typo in the "left shift by 1" special case.
> - Reworked constraint lists to have the NEON alternatives actually
>   reliably chosen in the "left shift by register" case.
> - Noticed that arm_emit_coreregs_64bit_shift actually does *not*
>   need a scratch for shifting by constant in any case, which
>   simplifies the implementation a bit.
> - Further minor simplifications & cleanup.
> 
> Tested on arm-linux-gnueabi (--with-arch=armv7-a --with-float=softfp
> --with-fpu=neon --with-mode=thumb) with no regressions.
> 
> OK for mainline?
> 

Hmm, this is going to cause bottlenecks on Cortex-A15: writing a Neon
single-precision register and then reading it back as a double-precision
value will cause scheduling problems.

The awkward thing here is that the shift only uses the bottom 8 bits of
the register, even though the instruction takes a 64-bit register, so we
don't want to go to the trouble of sign-extending the value all the way
out to 64-bits.

A solution to this is to have the set of the shifter register done as a
lane-set operation rather than as a set of the lower register, but it
probably needs some thought as to how to achieve this without creating
other overheads.

R.

> Bye,
> Ulrich
> 
> ChangeLog:
> 
> 2012-09-17  Andrew Stubbs  <a...@codesourcery.com>
>             Ulrich Weigand  <ulrich.weig...@linaro.org>
> 
>         * config/arm/arm.c (arm_print_operand): Add new 'E' format code.
>         (arm_emit_coreregs_64bit_shift): Fix comment.
>         * config/arm/arm.h (enum reg_class): Add VFP_LO_REGS_EVEN.
>         (REG_CLASS_NAMES, REG_CLASS_CONTENTS, IS_VFP_CLASS): Likewise.
>         * config/arm/arm.md (opt, opt_enabled): New attributes.
>         (enabled): Use opt_enabled.
>         (ashldi3, ashrdi3, lshrdi3): Add TARGET_NEON case.
>         * config/arm/constraints.md (T): New register constraint.
>         * config/arm/iterators.md (rshifts): New code iterator.
>         (shift, shifttype): New code attributes.
>         * config/arm/neon.md (signed_shift_di3_neon, unsigned_shift_di3_neon,
>         ashldi3_neon_noclobber, ashldi3_neon, ashrdi3_neon_imm_noclobber,
>         lshrdi3_neon_imm_noclobber, <shift>di3_neon): New patterns.
> 
> 
> Index: gcc/config/arm/arm.c
> ===================================================================
> *** gcc/config/arm/arm.c        (revision 191400)
> --- gcc/config/arm/arm.c        (working copy)
> *************** arm_print_operand (FILE *stream, rtx x,
> *** 17280,17285 ****
> --- 17280,17303 ----
>         }
>         return;
> 
> +     /* Print the VFP/Neon double precision register name that overlaps the
> +        given single-precision register.  */
> +     case 'E':
> +       {
> +       int mode = GET_MODE (x);
> +
> +       if (GET_MODE_SIZE (mode) != 4
> +           || GET_CODE (x) != REG
> +           || !IS_VFP_REGNUM (REGNO (x)))
> +         {
> +           output_operand_lossage ("invalid operand for code '%c'", code);
> +           return;
> +         }
> +
> +       fprintf (stream, "d%d", (REGNO (x) - FIRST_VFP_REGNUM) >> 1);
> +       }
> +       return;
> +
>       /* These two codes print the low/high doubleword register of a Neon quad
>          register, respectively.  For pair-structure types, can also print
>          low/high quadword registers.  */
> *************** arm_autoinc_modes_ok_p (enum machine_mod
> *** 26293,26300 ****
>      Input requirements:
>       - It is safe for the input and output to be the same register, but
>         early-clobber rules apply for the shift amount and scratch registers.
> !     - Shift by register requires both scratch registers.  Shift by a 
> constant
> !       less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other 
> cases
>         the scratch registers may be NULL.
>       - Ashiftrt by a register also clobbers the CC register.  */
>   void
> --- 26311,26317 ----
>      Input requirements:
>       - It is safe for the input and output to be the same register, but
>         early-clobber rules apply for the shift amount and scratch registers.
> !     - Shift by register requires both scratch registers.  In all other cases
>         the scratch registers may be NULL.
>       - Ashiftrt by a register also clobbers the CC register.  */
>   void
> Index: gcc/config/arm/arm.h
> ===================================================================
> *** gcc/config/arm/arm.h        (revision 191254)
> --- gcc/config/arm/arm.h        (working copy)
> *************** enum reg_class
> *** 1120,1125 ****
> --- 1120,1126 ----
>     CORE_REGS,
>     VFP_D0_D7_REGS,
>     VFP_LO_REGS,
> +   VFP_LO_REGS_EVEN,
>     VFP_HI_REGS,
>     VFP_REGS,
>     IWMMXT_REGS,
> *************** enum reg_class
> *** 1146,1151 ****
> --- 1147,1153 ----
>     "CORE_REGS",                \
>     "VFP_D0_D7_REGS",   \
>     "VFP_LO_REGS",      \
> +   "VFP_LO_REGS_EVEN", \
>     "VFP_HI_REGS",      \
>     "VFP_REGS",         \
>     "IWMMXT_REGS",      \
> *************** enum reg_class
> *** 1169,1174 ****
> --- 1171,1177 ----
>     { 0x00007FFF, 0x00000000, 0x00000000, 0x00000000 }, /* CORE_REGS */ \
>     { 0xFFFF0000, 0x00000000, 0x00000000, 0x00000000 }, /* VFP_D0_D7_REGS  */ 
> \
>     { 0xFFFF0000, 0x0000FFFF, 0x00000000, 0x00000000 }, /* VFP_LO_REGS  */ \
> +   { 0x55550000, 0x00005555, 0x00000000, 0x00000000 }, /* VFP_LO_REGS_EVEN  
> */ \
>     { 0x00000000, 0xFFFF0000, 0x0000FFFF, 0x00000000 }, /* VFP_HI_REGS  */ \
>     { 0xFFFF0000, 0xFFFFFFFF, 0x0000FFFF, 0x00000000 }, /* VFP_REGS  */ \
>     { 0x00000000, 0x00000000, 0xFFFF0000, 0x00000000 }, /* IWMMXT_REGS */     
>   \
> *************** enum reg_class
> *** 1182,1188 ****
> 
>   /* Any of the VFP register classes.  */
>   #define IS_VFP_CLASS(X) \
> !   ((X) == VFP_D0_D7_REGS || (X) == VFP_LO_REGS \
>      || (X) == VFP_HI_REGS || (X) == VFP_REGS)
> 
>   /* The same information, inverted:
> --- 1185,1191 ----
> 
>   /* Any of the VFP register classes.  */
>   #define IS_VFP_CLASS(X) \
> !   ((X) == VFP_D0_D7_REGS || (X) == VFP_LO_REGS || (X) == VFP_LO_REGS_EVEN \
>      || (X) == VFP_HI_REGS || (X) == VFP_REGS)
> 
>   /* The same information, inverted:
> Index: gcc/config/arm/neon.md
> ===================================================================
> *** gcc/config/arm/neon.md      (revision 191400)
> --- gcc/config/arm/neon.md      (working copy)
> ***************
> *** 1170,1175 ****
> --- 1170,1335 ----
>     DONE;
>   })
> 
> + ;; 64-bit shifts
> +
> + (define_insn "ashldi3_neon_noclobber"
> +   [(set (match_operand:DI 0 "s_register_operand"          "=w,w")
> +       (ashift:DI (match_operand:DI 1 "s_register_operand" " w,w")
> +                  (match_operand:SI 2 "reg_or_int_operand" " i,T")))]
> +   "TARGET_NEON && reload_completed
> +    && (!CONST_INT_P (operands[2])
> +        || (INTVAL (operands[2]) >= 0 && INTVAL (operands[2]) < 64))"
> +   "@
> +    vshl.u64\t%P0, %P1, %2
> +    vshl.u64\t%P0, %P1, %E2    @ ashl %P0, %P1, %2"
> +   [(set_attr "neon_type" "neon_vshl_ddd,neon_vshl_ddd")]
> + )
> +
> + (define_insn_and_split "ashldi3_neon"
> +   [(set (match_operand:DI 0 "s_register_operand"          "=w,?&r,?r,?w")
> +       (ashift:DI (match_operand:DI 1 "s_register_operand" " w, 0r, r, w")
> +                  (match_operand:SI 2 "reg_or_int_operand" "Ti,  r, i,Ti")))
> +    (clobber (match_scratch:SI 3                                   "=X,?&r, 
> X, X"))
> +    (clobber (match_scratch:SI 4                                   "=X,?&r, 
> X, X"))
> +    (clobber (reg:CC_C CC_REGNUM))]
> +   "TARGET_NEON"
> +   "#"
> +   "TARGET_NEON && reload_completed"
> +   [(const_int 0)]
> +   "
> +   {
> +     if (IS_VFP_REGNUM (REGNO (operands[0])))
> +       {
> +         if (CONST_INT_P (operands[2]))
> +         {
> +           if (INTVAL (operands[2]) < 1)
> +             {
> +               emit_insn (gen_movdi (operands[0], operands[1]));
> +               DONE;
> +             }
> +           else if (INTVAL (operands[2]) > 63)
> +             operands[2] = gen_rtx_CONST_INT (VOIDmode, 63);
> +         }
> +
> +       /* Ditch the unnecessary clobbers.  */
> +       emit_insn (gen_ashldi3_neon_noclobber (operands[0], operands[1],
> +                                              operands[2]));
> +       }
> +     else
> +       {
> +       if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1)
> +         /* This clobbers CC.  */
> +         emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
> +       else
> +         arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> +                                        operands[2], operands[3], 
> operands[4]);
> +       }
> +     DONE;
> +   }"
> +   [(set_attr "arch" "nota8,*,*,onlya8")
> +    (set_attr "opt" "*,speed,speed,*")]
> + )
> +
> + ; The shift amount needs to be negated for right-shifts
> + (define_insn "signed_shift_di3_neon"
> +   [(set (match_operand:DI 0 "s_register_operand"           "=w")
> +       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
> +                   (match_operand:SI 2 "s_register_operand" " T")]
> +                  UNSPEC_ASHIFT_SIGNED))]
> +   "TARGET_NEON && reload_completed"
> +   "vshl.s64\t%P0, %P1, %E2    @ ashr %P0, %P1, %2"
> +   [(set_attr "neon_type" "neon_vshl_ddd")]
> + )
> +
> + ; The shift amount needs to be negated for right-shifts
> + (define_insn "unsigned_shift_di3_neon"
> +   [(set (match_operand:DI 0 "s_register_operand"           "=w")
> +       (unspec:DI [(match_operand:DI 1 "s_register_operand" " w")
> +                   (match_operand:SI 2 "s_register_operand" " T")]
> +                  UNSPEC_ASHIFT_UNSIGNED))]
> +   "TARGET_NEON && reload_completed"
> +   "vshl.u64\t%P0, %P1, %E2    @ lshr %P0, %P1, %2"
> +   [(set_attr "neon_type" "neon_vshl_ddd")]
> + )
> +
> + (define_insn "ashrdi3_neon_imm_noclobber"
> +   [(set (match_operand:DI 0 "s_register_operand"            "=w")
> +       (ashiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
> +                    (match_operand:SI 2 "const_int_operand"  " i")))]
> +   "TARGET_NEON && reload_completed
> +    && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
> +   "vshr.s64\t%P0, %P1, %2"
> +   [(set_attr "neon_type" "neon_vshl_ddd")]
> + )
> +
> + (define_insn "lshrdi3_neon_imm_noclobber"
> +   [(set (match_operand:DI 0 "s_register_operand"            "=w")
> +       (lshiftrt:DI (match_operand:DI 1 "s_register_operand" " w")
> +                    (match_operand:SI 2 "const_int_operand"  " i")))]
> +   "TARGET_NEON && reload_completed
> +    && INTVAL (operands[2]) > 0 && INTVAL (operands[2]) <= 64"
> +   "vshr.u64\t%P0, %P1, %2"
> +   [(set_attr "neon_type" "neon_vshl_ddd")]
> + )
> +
> + ;; ashrdi3_neon
> + ;; lshrdi3_neon
> + (define_insn_and_split "<shift>di3_neon"
> +   [(set (match_operand:DI 0 "s_register_operand"           "= w, 
> w,?&r,?r,?w,?w")
> +       (rshifts:DI (match_operand:DI 1 "s_register_operand" "  w, w, 0r, r, 
> w, w")
> +                   (match_operand:SI 2 "reg_or_int_operand" "  r, i,  r, i, 
> r, i")))
> +    (clobber (match_scratch:SI 3                                    "= r, X, 
> &r, X, r, X"))
> +    (clobber (match_scratch:SI 4                                    "=&T, X, 
> &r, X,&T, X"))
> +    (clobber (reg:CC CC_REGNUM))]
> +   "TARGET_NEON"
> +   "#"
> +   "TARGET_NEON && reload_completed"
> +   [(const_int 0)]
> +   "
> +   {
> +     if (IS_VFP_REGNUM (REGNO (operands[0])))
> +       {
> +       if (CONST_INT_P (operands[2]))
> +         {
> +           if (INTVAL (operands[2]) < 1)
> +             {
> +               emit_insn (gen_movdi (operands[0], operands[1]));
> +               DONE;
> +             }
> +           else if (INTVAL (operands[2]) > 64)
> +             operands[2] = gen_rtx_CONST_INT (VOIDmode, 64);
> +
> +           /* Ditch the unnecessary clobbers.  */
> +           emit_insn (gen_<shift>di3_neon_imm_noclobber (operands[0],
> +                                                         operands[1],
> +                                                         operands[2]));
> +         }
> +       else
> +         {
> +           /* We must use a negative left-shift.  */
> +           emit_insn (gen_negsi2 (operands[3], operands[2]));
> +           emit_insn (gen_rtx_SET (SImode, operands[4], operands[3]));
> +           emit_insn (gen_<shifttype>_shift_di3_neon (operands[0], 
> operands[1],
> +                                                      operands[4]));
> +         }
> +       }
> +     else
> +       {
> +       if (CONST_INT_P (operands[2]) && INTVAL (operands[2]) == 1)
> +         /* This clobbers CC.  */
> +         emit_insn (gen_arm_<shift>di3_1bit (operands[0], operands[1]));
> +       else
> +         /* This clobbers CC (ASHIFTRT by register only).  */
> +         arm_emit_coreregs_64bit_shift (<CODE>, operands[0], operands[1],
> +                                        operands[2], operands[3], 
> operands[4]);
> +       }
> +
> +     DONE;
> +   }"
> +   [(set_attr "arch" "nota8,nota8,*,*,onlya8,onlya8")
> +    (set_attr "opt" "*,*,speed,speed,*,*")]
> + )
> +
>   ;; Widening operations
> 
>   (define_insn "widen_ssum<mode>3"
> Index: gcc/config/arm/constraints.md
> ===================================================================
> *** gcc/config/arm/constraints.md       (revision 191254)
> --- gcc/config/arm/constraints.md       (working copy)
> ***************
> *** 19,25 ****
>   ;; <http://www.gnu.org/licenses/>.
> 
>   ;; The following register constraints have been used:
> ! ;; - in ARM/Thumb-2 state: t, w, x, y, z
>   ;; - in Thumb state: h, b
>   ;; - in both states: l, c, k
>   ;; In ARM state, 'l' is an alias for 'r'
> --- 19,25 ----
>   ;; <http://www.gnu.org/licenses/>.
> 
>   ;; The following register constraints have been used:
> ! ;; - in ARM/Thumb-2 state: t, T, w, x, y, z
>   ;; - in Thumb state: h, b
>   ;; - in both states: l, c, k
>   ;; In ARM state, 'l' is an alias for 'r'
> ***************
> *** 44,49 ****
> --- 44,52 ----
>   (define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
>    "The VFP registers @code{s0}-@code{s31}.")
> 
> + (define_register_constraint "T" "TARGET_32BIT ? VFP_LO_REGS_EVEN : NO_REGS"
> +  "The even numbered VFP registers @code{s0}-@code{s31}.")
> +
>   (define_register_constraint "w"
>     "TARGET_32BIT ? (TARGET_VFPD32 ? VFP_REGS : VFP_LO_REGS) : NO_REGS"
>    "The VFP registers @code{d0}-@code{d15}, or @code{d0}-@code{d31} for 
> VFPv3.")
> Index: gcc/config/arm/iterators.md
> ===================================================================
> *** gcc/config/arm/iterators.md (revision 191254)
> --- gcc/config/arm/iterators.md (working copy)
> ***************
> *** 188,193 ****
> --- 188,196 ----
>   ;; A list of widening operators
>   (define_code_iterator SE [sign_extend zero_extend])
> 
> + ;; Right shifts
> + (define_code_iterator rshifts [ashiftrt lshiftrt])
> +
>   
> ;;----------------------------------------------------------------------------
>   ;; Mode attributes
>   
> ;;----------------------------------------------------------------------------
> ***************
> *** 449,451 ****
> --- 452,459 ----
> 
>   ;; Assembler mnemonics for signedness of widening operations.
>   (define_code_attr US [(sign_extend "s") (zero_extend "u")])
> +
> + ;; Right shifts
> + (define_code_attr shift [(ashiftrt "ashr") (lshiftrt "lshr")])
> + (define_code_attr shifttype [(ashiftrt "signed") (lshiftrt "unsigned")])
> +
> Index: gcc/config/arm/arm.md
> ===================================================================
> *** gcc/config/arm/arm.md       (revision 191254)
> --- gcc/config/arm/arm.md       (working copy)
> ***************
> *** 249,254 ****
> --- 249,270 ----
> 
>         (const_string "no")))
> 
> + (define_attr "opt" "any,speed,size"
> +   (const_string "any"))
> +
> + (define_attr "opt_enabled" "no,yes"
> +   (cond [(eq_attr "opt" "any")
> +          (const_string "yes")
> +
> +        (and (eq_attr "opt" "speed")
> +             (match_test "optimize_function_for_speed_p (cfun)"))
> +        (const_string "yes")
> +
> +        (and (eq_attr "opt" "size")
> +             (match_test "optimize_function_for_size_p (cfun)"))
> +        (const_string "yes")]
> +       (const_string "no")))
> +
>   ; Allows an insn to disable certain alternatives for reasons other than
>   ; arch support.
>   (define_attr "insn_enabled" "no,yes"
> ***************
> *** 256,266 ****
> 
>   ; Enable all alternatives that are both arch_enabled and insn_enabled.
>    (define_attr "enabled" "no,yes"
> !    (if_then_else (eq_attr "insn_enabled" "yes")
> !                (if_then_else (eq_attr "arch_enabled" "yes")
> !                              (const_string "yes")
> !                              (const_string "no"))
> !                 (const_string "no")))
> 
>   ; POOL_RANGE is how far away from a constant pool entry that this insn
>   ; can be placed.  If the distance is zero, then this insn will never
> --- 272,286 ----
> 
>   ; Enable all alternatives that are both arch_enabled and insn_enabled.
>    (define_attr "enabled" "no,yes"
> !    (cond [(eq_attr "insn_enabled" "no")
> !         (const_string "no")
> !
> !         (eq_attr "arch_enabled" "no")
> !         (const_string "no")
> !
> !         (eq_attr "opt_enabled" "no")
> !         (const_string "no")]
> !        (const_string "yes")))
> 
>   ; POOL_RANGE is how far away from a constant pool entry that this insn
>   ; can be placed.  If the distance is zero, then this insn will never
> ***************
> *** 3492,3498 ****
>                      (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> --- 3512,3525 ----
>                      (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (TARGET_NEON)
> !     {
> !       /* Delay the decision whether to use NEON or core-regs until
> !        register allocation.  */
> !       emit_insn (gen_ashldi3_neon (operands[0], operands[1], operands[2]));
> !       DONE;
> !     }
> !   else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> ***************
> *** 3566,3572 ****
>                        (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> --- 3593,3606 ----
>                        (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (TARGET_NEON)
> !     {
> !       /* Delay the decision whether to use NEON or core-regs until
> !        register allocation.  */
> !       emit_insn (gen_ashrdi3_neon (operands[0], operands[1], operands[2]));
> !       DONE;
> !     }
> !   else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> ***************
> *** 3638,3644 ****
>                        (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> --- 3672,3685 ----
>                        (match_operand:SI 2 "reg_or_int_operand" "")))]
>     "TARGET_32BIT"
>     "
> !   if (TARGET_NEON)
> !     {
> !       /* Delay the decision whether to use NEON or core-regs until
> !        register allocation.  */
> !       emit_insn (gen_lshrdi3_neon (operands[0], operands[1], operands[2]));
> !       DONE;
> !     }
> !   else if (!CONST_INT_P (operands[2]) && TARGET_REALLY_IWMMXT)
>       ; /* No special preparation statements; expand pattern as above.  */
>     else
>       {
> --
>   Dr. Ulrich Weigand
>   GNU Toolchain for Linux on System z and Cell BE
>   ulrich.weig...@de.ibm.com
> 
> 




Reply via email to