Michael Collison <michael.colli...@arm.com> writes:
> Updated the patch per Richard's suggestions to allow scheduling of
> instructions before reload.

Thanks, this looks good to me FWIW, but obviously I can't approve it.

Richard

> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?
>
> 2017-05-22  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
>           Michael Collison <michael.colli...@arm.com>
>
>       PR target/70119
>       * config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
>       New pattern.
>       (*aarch64_reg_<mode>3_neg_mask2): New pattern.
>       (*aarch64_reg_<mode>3_minus_mask): New pattern.
>       (*aarch64_<optab>_reg_di3_mask2): New pattern.
>       * config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
>       of shift when the shift amount is masked with constant equal to
>       the size of the mode.
>       * config/aarch64/predicates.md (subreg_lowpart_operator): New
>       predicate.
>
>
> 2017-05-22  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
>           Michael Collison <michael.colli...@arm.com>
>
>       PR target/70119
>       * gcc.target/aarch64/var_shift_mask_1.c: New test.
>
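For context, a minimal sketch (not part of the patch; the function name is just
illustrative) of the redundant masking these patterns let the compiler drop.
The complete set of cases is covered by the new var_shift_mask_1.c test further
down:

    /* AArch64's LSL/LSR/ASR/ROR (register) instructions already truncate the
       shift amount to the register width, so the "& 31" below is expected to
       fold into the shift and the function should compile to a single
       "lsl w0, w0, w1" at -O2.  */
    unsigned int
    lsl_masked (unsigned int x, int n)
    {
      return x << (n & 31);
    }
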
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandif...@linaro.org] 
> Sent: Thursday, June 15, 2017 6:40 AM
> To: Michael Collison <michael.colli...@arm.com>
> Cc: Wilco Dijkstra <wilco.dijks...@arm.com>; Christophe Lyon 
> <christophe.l...@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd 
> <n...@arm.com>
> Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues
>
> Michael Collison <michael.colli...@arm.com> writes:
>> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +    (SHIFT:GPI
>> +      (match_operand:GPI 1 "register_operand" "r")
>> +      (match_operator 4 "subreg_lowpart_operator"
>> +      [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>> +                       (match_operand 3 "const_int_operand" "n")))])))
>> +   (clobber (match_scratch:SI 5 "=&r"))]
>> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>> +  "#"
>> +  "&& reload_completed"
>> +  [(const_int 0)]
>> +  {
>> +    emit_insn (gen_negsi2 (operands[5], operands[2]));
>> +
>> +    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
>> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>> +                                         SUBREG_BYTE (operands[4]));
>> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>> +    DONE;
>> +  }
>> +)
>
> Thanks, I agree this looks correct from the split/reload_completed POV.
> I think we can go one better though, either:
>
> (a) Still allow the split when !reload_completed, and use:
>
>      if (GET_MODE (operands[5]) == SCRATCH)
>        operands[5] = gen_reg_rtx (SImode);
>
>     This will allow the individual instructions to be scheduled by sched1.
>
> (b) Continue to restrict the split to reload_completed, change operand 0
>     to =&r so that it can be used as a temporary, and drop operand 5 entirely.
>
> Or perhaps do both:
>
> (define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>   [(set (match_operand:GPI 0 "register_operand" "=&r")
>       (SHIFT:GPI
>         (match_operand:GPI 1 "register_operand" "r")
>         (match_operator 4 "subreg_lowpart_operator"
>         [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>                          (match_operand 3 "const_int_operand" "n")))])))]
>   "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>   "#"
>   "&& 1"
>   [(const_int 0)]
>   {
>     rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
>                : operands[0]);
>     emit_insn (gen_negsi2 (tmp, operands[2]));
>
>     rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
>     rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>                                    SUBREG_BYTE (operands[4]));
>     emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>     DONE;
>   }
> )
>
> Sorry for the run-around.  I should have realised earlier that these patterns 
> didn't really need a distinct register after RA.
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5707e53..45377a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7407,17 +7407,26 @@ cost_plus:
>          }
>        else
>          {
> -       if (speed)
> +       if (VECTOR_MODE_P (mode))
>           {
> -           if (VECTOR_MODE_P (mode))
> -             {
> -               /* Vector shift (register).  */
> -               *cost += extra_cost->vect.alu;
> -             }
> -           else
> +           if (speed)
> +             /* Vector shift (register).  */
> +             *cost += extra_cost->vect.alu;
> +         }
> +       else
> +         {
> +           if (speed)
> +             /* LSLV.  */
> +             *cost += extra_cost->alu.shift_reg;
> +
> +           if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +               && CONST_INT_P (XEXP (op1, 1))
> +               && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
>               {
> -               /* LSLV.  */
> -               *cost += extra_cost->alu.shift_reg;
> +               *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +               /* We already demanded XEXP (op1, 0) to be REG_P, so
> +                  don't recurse into it.  */
> +               return true;
>               }
>           }
>         return false;  /* All arguments need to be in registers.  */
> @@ -7446,14 +7455,27 @@ cost_plus:
>       }
>        else
>       {
> -
> -       /* ASR (register) and friends.  */
> -       if (speed)
> +       if (VECTOR_MODE_P (mode))
>           {
> -           if (VECTOR_MODE_P (mode))
> +           if (speed)
> +             /* Vector shift (register).  */
>               *cost += extra_cost->vect.alu;
> -           else
> +         }
> +       else
> +         {
> +           if (speed)
> +             /* ASR (register) and friends.  */
>               *cost += extra_cost->alu.shift_reg;
> +
> +           if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +               && CONST_INT_P (XEXP (op1, 1))
> +               && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
> +             {
> +               *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +               /* We already demanded XEXP (op1, 0) to be REG_P, so
> +                  don't recurse into it.  */
> +               return true;
> +             }
>           }
>         return false;  /* All arguments need to be in registers.  */
>       }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index d89df66..ff5720c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3942,6 +3942,97 @@
>    }
>  )
>  
> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> +;; they truncate the shift/rotate amount by the size of the registers they
> +;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
> +;; such redundant masking instructions.  GCC can do that automatically when
> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> +;; because some of the SISD shift alternatives don't perform this truncation.
> +;; So this pattern exists to catch such cases.
> +
> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +     (SHIFT:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +        [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> +                  (match_operand 3 "const_int_operand" "n"))])))]
> +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
> +  "<shift>\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "shift_reg")]
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +     (SHIFT:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +       [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> +                        (match_operand 3 "const_int_operand" "n")))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +            : operands[0]);
> +    emit_insn (gen_negsi2 (tmp, operands[2]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> +                                  SUBREG_BYTE (operands[4]));
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +     (ashift:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (minus:QI (match_operand 2 "const_int_operand" "n")
> +                 (match_operator 5 "subreg_lowpart_operator"
> +                 [(and:SI (match_operand:SI 3 "register_operand" "r")
> +                          (match_operand 4 "const_int_operand" "n"))]))))]
> +  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> +   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +            : operands[0]);
> +
> +    emit_insn (gen_negsi2 (tmp, operands[3]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
> +                                  SUBREG_BYTE (operands[5]));
> +
> +    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "*aarch64_<optab>_reg_di3_mask2"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +     (SHIFT:DI
> +       (match_operand:DI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +        [(and:SI (match_operand:SI 2 "register_operand" "r")
> +                  (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
> +{
> +  rtx xop[3];
> +  xop[0] = operands[0];
> +  xop[1] = operands[1];
> +  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
> +  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
> +  return "";
> +}
> +  [(set_attr "type" "shift_reg")]
> +)
> +
>  ;; Logical left shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index cd7ded9..ad8a43c 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -35,6 +35,10 @@
>    (and (match_code "const_int")
>         (match_test "op == CONST0_RTX (mode)")))
>  
> +(define_special_predicate "subreg_lowpart_operator"
> +  (and (match_code "subreg")
> +       (match_test "subreg_lowpart_p (op)")))
> +
>  (define_predicate "aarch64_ccmp_immediate"
>    (and (match_code "const_int")
>         (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> new file mode 100644
> index 0000000..e2b020e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* The integer variable shift and rotate instructions truncate their
> +   shift amounts by the datasize.  Make sure that we don't emit a redundant
> +   masking operation.  */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> +  return x << (y & 31);
> +}
> +
> +unsigned long
> +f2 (unsigned long x, int y)
> +{
> +  return x << (y & 63);
> +}
> +
> +unsigned long
> +f3 (unsigned long bit_addr, int y)
> +{
> +  unsigned long bitnumb = bit_addr & 63;
> +  return (1L << bitnumb);
> +}
> +
> +unsigned int
> +f4 (unsigned int x, unsigned int y)
> +{
> +  y &= 31;
> +  return x >> y | (x << (32 - y));
> +}
> +
> +unsigned long
> +f5 (unsigned long x, unsigned long y)
> +{
> +  y &= 63;
> +  return x >> y | (x << (64 - y));
> +}
> +
> +unsigned long
> +f6 (unsigned long x, unsigned long y)
> +{
> +
> +  return (x << (64 - (y & 63)));
> +
> +}
> +
> +unsigned long
> +f7 (unsigned long x, unsigned long y)
> +{
> +  return (x << -(y & 63));
> +}
> +
> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 
> 1 } } */
> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 
> 4 } } */
> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 
> 1 } } */
> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 
> 1 } } */
> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } 
> } */
