Michael Collison <michael.colli...@arm.com> writes:
> Updated the patch per Richard's suggestions to allow scheduling of
> instructions before reload.

Thanks, this looks good to me FWIW, but obviously I can't approve it.

Richard

> Bootstrapped and tested on aarch64-linux-gnu.  Okay for trunk?
>
> 2017-05-22  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
>             Michael Collison  <michael.colli...@arm.com>
>
>       PR target/70119
>       * config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
>       New pattern.
>       (*aarch64_reg_<mode>3_neg_mask2): New pattern.
>       (*aarch64_reg_<mode>3_minus_mask): New pattern.
>       (*aarch64_<optab>_reg_di3_mask2): New pattern.
>       * config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
>       of shift when the shift amount is masked with constant equal to
>       the size of the mode.
>       * config/aarch64/predicates.md (subreg_lowpart_operator): New
>       predicate.
>
>
> 2016-05-22  Kyrylo Tkachov  <kyrylo.tkac...@arm.com>
>             Michael Collison  <michael.colli...@arm.com>
>
>       PR target/70119
>       * gcc.target/aarch64/var_shift_mask_1.c: New test.
>
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandif...@linaro.org]
> Sent: Thursday, June 15, 2017 6:40 AM
> To: Michael Collison <michael.colli...@arm.com>
> Cc: Wilco Dijkstra <wilco.dijks...@arm.com>; Christophe Lyon
> <christophe.l...@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd
> <n...@arm.com>
> Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues
>
> Michael Collison <michael.colli...@arm.com> writes:
>> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +     (SHIFT:GPI
>> +       (match_operand:GPI 1 "register_operand" "r")
>> +       (match_operator 4 "subreg_lowpart_operator"
>> +       [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>> +                        (match_operand 3 "const_int_operand" "n")))])))
>> +   (clobber (match_scratch:SI 5 "=&r"))]
>> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>> +  "#"
>> +  "&& reload_completed"
>> +  [(const_int 0)]
>> +  {
>> +    emit_insn (gen_negsi2 (operands[5], operands[2]));
>> +
>> +    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
>> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>> +                                     SUBREG_BYTE (operands[4]));
>> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>> +    DONE;
>> +  }
>> +)
>
> Thanks, I agree this looks correct from the split/reload_completed POV.
> I think we can go one better though, either:
>
> (a) Still allow the split when !reload_completed, and use:
>
>       if (GET_MODE (operands[5]) == SCRATCH)
>         operands[5] = gen_reg_rtx (SImode);
>
>     This will allow the individual instructions to be scheduled by sched1.
>
> (b) Continue to restrict the split to reload_completed, change operand 0
>     to =&r so that it can be used as a temporary, and drop operand 5
>     entirely.
>
> Or perhaps do both:
>
> (define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>   [(set (match_operand:GPI 0 "register_operand" "=&r")
>      (SHIFT:GPI
>        (match_operand:GPI 1 "register_operand" "r")
>        (match_operator 4 "subreg_lowpart_operator"
>        [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>                         (match_operand 3 "const_int_operand" "n")))])))]
>   "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>   "#"
>   "&& 1"
>   [(const_int 0)]
>   {
>     rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
>                : operands[0]);
>     emit_insn (gen_negsi2 (tmp, operands[2]));
>
>     rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
>     rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>                                      SUBREG_BYTE (operands[4]));
>     emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>     DONE;
>   }
> )
>
> Sorry for the run-around.  I should have realised earlier that these
> patterns didn't really need a distinct register after RA.
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5707e53..45377a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7407,17 +7407,26 @@ cost_plus:
>          }
>        else
>          {
> -          if (speed)
> +          if (VECTOR_MODE_P (mode))
>              {
> -              if (VECTOR_MODE_P (mode))
> -                {
> -                  /* Vector shift (register).  */
> -                  *cost += extra_cost->vect.alu;
> -                }
> -              else
> +              if (speed)
> +                /* Vector shift (register).  */
> +                *cost += extra_cost->vect.alu;
> +            }
> +          else
> +            {
> +              if (speed)
> +                /* LSLV.  */
> +                *cost += extra_cost->alu.shift_reg;
> +
> +              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +                  && CONST_INT_P (XEXP (op1, 1))
> +                  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
>                  {
> -                  /* LSLV.  */
> -                  *cost += extra_cost->alu.shift_reg;
> +                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +                  /* We already demanded XEXP (op1, 0) to be REG_P, so
> +                     don't recurse into it.  */
> +                  return true;
>                  }
>              }
>            return false;  /* All arguments need to be in registers.  */
> @@ -7446,14 +7455,27 @@
>          }
>        else
>          {
> -
> -          /* ASR (register) and friends.  */
> -          if (speed)
> +          if (VECTOR_MODE_P (mode))
>              {
> -              if (VECTOR_MODE_P (mode))
> +              if (speed)
> +                /* Vector shift (register).  */
>                  *cost += extra_cost->vect.alu;
> -              else
> +            }
> +          else
> +            {
> +              if (speed)
> +                /* ASR (register) and friends.  */
>                  *cost += extra_cost->alu.shift_reg;
> +
> +              if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +                  && CONST_INT_P (XEXP (op1, 1))
> +                  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
> +                {
> +                  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +                  /* We already demanded XEXP (op1, 0) to be REG_P, so
> +                     don't recurse into it.  */
> +                  return true;
> +                }
>              }
>            return false;  /* All arguments need to be in registers.  */
>        }
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index d89df66..ff5720c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3942,6 +3942,97 @@
>  }
>  )
>
> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> +;; they truncate the shift/rotate amount by the size of the registers they
> +;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
> +;; such redundant masking instructions.  GCC can do that automatically when
> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> +;; because some of the SISD shift alternatives don't perform this truncation.
> +;; So this pattern exists to catch such cases.
> +
> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +     (SHIFT:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +        [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> +                  (match_operand 3 "const_int_operand" "n"))])))]
> +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
> +  "<shift>\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "shift_reg")]
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +     (SHIFT:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +       [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> +                        (match_operand 3 "const_int_operand" "n")))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +               : operands[0]);
> +    emit_insn (gen_negsi2 (tmp, operands[2]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> +                                     SUBREG_BYTE (operands[4]));
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +     (ashift:GPI
> +       (match_operand:GPI 1 "register_operand" "r")
> +       (minus:QI (match_operand 2 "const_int_operand" "n")
> +                 (match_operator 5 "subreg_lowpart_operator"
> +                 [(and:SI (match_operand:SI 3 "register_operand" "r")
> +                          (match_operand 4 "const_int_operand" "n"))]))))]
> +  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> +   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +               : operands[0]);
> +
> +    emit_insn (gen_negsi2 (tmp, operands[3]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
> +                                     SUBREG_BYTE (operands[5]));
> +
> +    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "*aarch64_<optab>_reg_di3_mask2"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +     (SHIFT:DI
> +       (match_operand:DI 1 "register_operand" "r")
> +       (match_operator 4 "subreg_lowpart_operator"
> +        [(and:SI (match_operand:SI 2 "register_operand" "r")
> +                 (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
> +{
> +  rtx xop[3];
> +  xop[0] = operands[0];
> +  xop[1] = operands[1];
> +  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
> +  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
> +  return "";
> +}
> +  [(set_attr "type" "shift_reg")]
> +)
> +
>  ;; Logical left shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index cd7ded9..ad8a43c 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -35,6 +35,10 @@
>    (and (match_code "const_int")
>         (match_test "op == CONST0_RTX (mode)")))
>
> +(define_special_predicate "subreg_lowpart_operator"
> +  (and (match_code "subreg")
> +       (match_test "subreg_lowpart_p (op)")))
> +
>  (define_predicate "aarch64_ccmp_immediate"
>    (and (match_code "const_int")
>         (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> new file mode 100644
> index 0000000..e2b020e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* The integer variable shift and rotate instructions truncate their
> +   shift amounts by the datasize.  Make sure that we don't emit a redundant
> +   masking operation.  */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> +  return x << (y & 31);
> +}
> +
> +unsigned long
> +f2 (unsigned long x, int y)
> +{
> +  return x << (y & 63);
> +}
> +
> +unsigned long
> +f3 (unsigned long bit_addr, int y)
> +{
> +  unsigned long bitnumb = bit_addr & 63;
> +  return (1L << bitnumb);
> +}
> +
> +unsigned int
> +f4 (unsigned int x, unsigned int y)
> +{
> +  y &= 31;
> +  return x >> y | (x << (32 - y));
> +}
> +
> +unsigned long
> +f5 (unsigned long x, unsigned long y)
> +{
> +  y &= 63;
> +  return x >> y | (x << (64 - y));
> +}
> +
> +unsigned long
> +f6 (unsigned long x, unsigned long y)
> +{
> +
> +  return (x << (64 - (y & 63)));
> +
> +}
> +
> +unsigned long
> +f7 (unsigned long x, unsigned long y)
> +{
> +  return (x << -(y & 63));
> +}
> +
> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
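
For anyone following PR 70119 from the sidelines, here is a self-contained
sketch of what the scan-assembler directives above are checking for.  It is
not part of the patch; the function simply mirrors f1 from the new test, the
file name and command line are only illustrative, and the register numbers
shown are a sketch rather than something the test requires:

    /* shift_mask_example.c -- compile with: gcc -O2 -S (aarch64 target).  */
    unsigned
    shift_mask_example (unsigned x, int y)
    {
      /* The W-register form of LSL already uses only the low 5 bits of the
         shift count, so the "& 31" is redundant at the instruction level.
         With the new *aarch64_<optab>_reg_<mode>3_mask1 pattern the expected
         code is roughly

             lsl     w0, w0, w1
             ret

         instead of the earlier

             and     w1, w1, 31
             lsl     w0, w0, w1
             ret                                                           */
      return x << (y & 31);
    }

The test itself only insists on a single "lsl w..., w..., w..." and on the
absence of "and w..., w..., 31", so any equivalent register allocation is
fine.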