<dhr...@nvidia.com> writes:
> From: Dhruv Chawla <dhr...@nvidia.com>
>
> This patch modifies the shift expander to immediately lower constant
> shifts without unspec. It also modifies the ADR, SRA and ADDHNB patterns
> to match the lowered forms of the shifts, as the predicate register is
> not required for these instructions.
>
> Bootstrapped and regtested on aarch64-linux-gnu.
>
> Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
> Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>
>
> gcc/ChangeLog:
>
>         * gcc/config/aarch64/aarch64-sve.md (@aarch64_adr<mode>_shift):
>         Match lowered form of ashift.
>         (*aarch64_adr<mode>_shift): Likewise.
>         (*aarch64_adr_shift_sxtw): Likewise.
>         (*aarch64_adr_shift_uxtw): Likewise.
>         (<ASHIFT:optab><mode>3): Check amount instead of operands[2] in
>         aarch64_sve_<lr>shift_operand.
>         (v<optab><mode>3): Generate unpredicated shifts for constant
>         operands.
>         (@aarch64_pred_<optab><mode>): Convert to a define_expand.
>         (*aarch64_pred_<optab><mode>): Create define_insn_and_split pattern
>         from @aarch64_pred_<optab><mode>.
>         (*post_ra_v_ashl<mode>3): Rename to ...
>         (aarch64_vashl<mode>3_const): ... this and remove reload requirement.
>         (*post_ra_v_<optab><mode>3): Rename to ...
>         (aarch64_v<optab><mode>3_const): ... this and remove reload
>         requirement.
>         * gcc/config/aarch64/aarch64-sve2.md
>         (@aarch64_sve_add_<sve_int_op><mode>): Match lowered form of
>         SHIFTRT.
>         (*aarch64_sve2_sra<mode>): Likewise.
>         (*bitmask_shift_plus<mode>): Match lowered form of lshiftrt.
> ---
>  gcc/config/aarch64/aarch64-sve.md  | 119 +++++++++++++++--------------
>  gcc/config/aarch64/aarch64-sve2.md |  46 ++++-------
>  2 files changed, 75 insertions(+), 90 deletions(-)
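To make the description above concrete: "lower constant shifts without
unspec" means that a shift of an SVE vector by an in-range immediate is
now represented as a bare (ashift ...), (ashiftrt ...) or (lshiftrt ...)
rather than being wrapped in UNSPEC_PRED_X with an all-true predicate.
A minimal C loop of the kind affected -- the function name, flags and
expected assembly are my own illustration, not taken from the patch or
its testsuite:

  #include <stdint.h>

  /* Compiled with something like -O3 -march=armv8.2-a+sve, the shift by
     a constant can now be emitted as an unpredicated ASR
     (asr z0.s, z0.s, #2), ideally via the renamed
     aarch64_v<optab><mode>3_const pattern.  */
  void
  shift_right_by_two (int32_t *restrict dst, const int32_t *restrict src,
                      int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = src[i] >> 2;
  }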
OK, thanks.

It doesn't look like you're listed in MAINTAINERS as having write
access.  If that's right, and if you'd like access, please follow the
instructions in https://gcc.gnu.org/gitwrite.html (I'll sponsor).

Richard

>
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index bf7569f932b..e1ec778b10d 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -4234,80 +4234,57 @@
>  (define_expand "@aarch64_adr<mode>_shift"
>    [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
>          (plus:SVE_FULL_SDI
> -          (unspec:SVE_FULL_SDI
> -            [(match_dup 4)
> -             (ashift:SVE_FULL_SDI
> -               (match_operand:SVE_FULL_SDI 2 "register_operand")
> -               (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))]
> -            UNSPEC_PRED_X)
> +          (ashift:SVE_FULL_SDI
> +            (match_operand:SVE_FULL_SDI 2 "register_operand")
> +            (match_operand:SVE_FULL_SDI 3 "const_1_to_3_operand"))
>            (match_operand:SVE_FULL_SDI 1 "register_operand")))]
>    "TARGET_SVE && TARGET_NON_STREAMING"
> -  {
> -    operands[4] = CONSTM1_RTX (<VPRED>mode);
> -  }
>  )
>
> -(define_insn_and_rewrite "*aarch64_adr<mode>_shift"
> +(define_insn "*aarch64_adr<mode>_shift"
>    [(set (match_operand:SVE_24I 0 "register_operand" "=w")
>          (plus:SVE_24I
> -          (unspec:SVE_24I
> -            [(match_operand 4)
> -             (ashift:SVE_24I
> -               (match_operand:SVE_24I 2 "register_operand" "w")
> -               (match_operand:SVE_24I 3 "const_1_to_3_operand"))]
> -            UNSPEC_PRED_X)
> +          (ashift:SVE_24I
> +            (match_operand:SVE_24I 2 "register_operand" "w")
> +            (match_operand:SVE_24I 3 "const_1_to_3_operand"))
>            (match_operand:SVE_24I 1 "register_operand" "w")))]
>    "TARGET_SVE && TARGET_NON_STREAMING"
>    "adr\t%0.<Vctype>, [%1.<Vctype>, %2.<Vctype>, lsl %3]"
> -  "&& !CONSTANT_P (operands[4])"
> -  {
> -    operands[4] = CONSTM1_RTX (<VPRED>mode);
> -  }
>  )
>
>  ;; Same, but with the index being sign-extended from the low 32 bits.
>  (define_insn_and_rewrite "*aarch64_adr_shift_sxtw"
>    [(set (match_operand:VNx2DI 0 "register_operand" "=w")
>          (plus:VNx2DI
> -          (unspec:VNx2DI
> -            [(match_operand 4)
> -             (ashift:VNx2DI
> -               (unspec:VNx2DI
> -                 [(match_operand 5)
> -                  (sign_extend:VNx2DI
> -                    (truncate:VNx2SI
> -                      (match_operand:VNx2DI 2 "register_operand" "w")))]
> -                 UNSPEC_PRED_X)
> -               (match_operand:VNx2DI 3 "const_1_to_3_operand"))]
> -            UNSPEC_PRED_X)
> +          (ashift:VNx2DI
> +            (unspec:VNx2DI
> +              [(match_operand 4)
> +               (sign_extend:VNx2DI
> +                 (truncate:VNx2SI
> +                   (match_operand:VNx2DI 2 "register_operand" "w")))]
> +              UNSPEC_PRED_X)
> +            (match_operand:VNx2DI 3 "const_1_to_3_operand"))
>            (match_operand:VNx2DI 1 "register_operand" "w")))]
>    "TARGET_SVE && TARGET_NON_STREAMING"
>    "adr\t%0.d, [%1.d, %2.d, sxtw %3]"
> -  "&& (!CONSTANT_P (operands[4]) || !CONSTANT_P (operands[5]))"
> +  "&& !CONSTANT_P (operands[4])"
>    {
> -    operands[5] = operands[4] = CONSTM1_RTX (VNx2BImode);
> +    operands[4] = CONSTM1_RTX (VNx2BImode);
>    }
>  )
>
>  ;; Same, but with the index being zero-extended from the low 32 bits.
> -(define_insn_and_rewrite "*aarch64_adr_shift_uxtw"
> +(define_insn "*aarch64_adr_shift_uxtw"
>    [(set (match_operand:VNx2DI 0 "register_operand" "=w")
>          (plus:VNx2DI
> -          (unspec:VNx2DI
> -            [(match_operand 5)
> -             (ashift:VNx2DI
> -               (and:VNx2DI
> -                 (match_operand:VNx2DI 2 "register_operand" "w")
> -                 (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))
> -               (match_operand:VNx2DI 3 "const_1_to_3_operand"))]
> -            UNSPEC_PRED_X)
> +          (ashift:VNx2DI
> +            (and:VNx2DI
> +              (match_operand:VNx2DI 2 "register_operand" "w")
> +              (match_operand:VNx2DI 4 "aarch64_sve_uxtw_immediate"))
> +            (match_operand:VNx2DI 3 "const_1_to_3_operand"))
>            (match_operand:VNx2DI 1 "register_operand" "w")))]
>    "TARGET_SVE && TARGET_NON_STREAMING"
>    "adr\t%0.d, [%1.d, %2.d, uxtw %3]"
> -  "&& !CONSTANT_P (operands[5])"
> -  {
> -    operands[5] = CONSTM1_RTX (VNx2BImode);
> -  }
>  )
>
>  ;; -------------------------------------------------------------------------
> @@ -4899,7 +4876,7 @@
>      if (CONST_INT_P (operands[2]))
>        {
>          amount = gen_const_vec_duplicate (<MODE>mode, operands[2]);
> -        if (!aarch64_sve_<lr>shift_operand (operands[2], <MODE>mode))
> +        if (!aarch64_sve_<lr>shift_operand (amount, <MODE>mode))
>            amount = force_reg (<MODE>mode, amount);
>        }
>      else
> @@ -4923,15 +4900,40 @@
>            UNSPEC_PRED_X))]
>    "TARGET_SVE"
>    {
> +    if (CONSTANT_P (operands[2]))
> +      {
> +        emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[1],
> +                                                      operands[2]));
> +        DONE;
> +      }
>      operands[3] = aarch64_ptrue_reg (<VPRED>mode);
>    }
>  )
>
> -;; Shift by a vector, predicated with a PTRUE.  We don't actually need
> -;; the predicate for the first alternative, but using Upa or X isn't
> -;; likely to gain much and would make the instruction seem less uniform
> -;; to the register allocator.
> -(define_insn_and_split "@aarch64_pred_<optab><mode>"
> +;; Shift by a vector, predicated with a PTRUE.
> +(define_expand "@aarch64_pred_<optab><mode>"
> +  [(set (match_operand:SVE_I 0 "register_operand")
> +        (unspec:SVE_I
> +          [(match_operand:<VPRED> 1 "register_operand")
> +           (ASHIFT:SVE_I
> +             (match_operand:SVE_I 2 "register_operand")
> +             (match_operand:SVE_I 3 "aarch64_sve_<lr>shift_operand"))]
> +          UNSPEC_PRED_X))]
> +  "TARGET_SVE"
> +  {
> +    if (CONSTANT_P (operands[3]))
> +      {
> +        emit_insn (gen_aarch64_v<optab><mode>3_const (operands[0], operands[2],
> +                                                      operands[3]));
> +        DONE;
> +      }
> +  }
> +)
> +
> +;; We don't actually need the predicate for the first alternative, but
> +;; using Upa or X isn't likely to gain much and would make the instruction
> +;; seem less uniform to the register allocator.
> +(define_insn_and_split "*aarch64_pred_<optab><mode>"
>    [(set (match_operand:SVE_I 0 "register_operand")
>          (unspec:SVE_I
>            [(match_operand:<VPRED> 1 "register_operand")
> @@ -4946,33 +4948,32 @@
>      [ w   , Upl , w , 0 ; *   ] <shift>r\t%0.<Vetype>, %1/m, %3.<Vetype>, %2.<Vetype>
>      [ ?&w , Upl , w , w ; yes ] movprfx\t%0, %2\;<shift>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
>    }
> -  "&& reload_completed
> -     && !register_operand (operands[3], <MODE>mode)"
> +  "&& !register_operand (operands[3], <MODE>mode)"
>    [(set (match_dup 0) (ASHIFT:SVE_I (match_dup 2) (match_dup 3)))]
>    ""
>  )
>
> -;; Unpredicated shift operations by a constant (post-RA only).
> +;; Unpredicated shift operations by a constant.
>  ;; These are generated by splitting a predicated instruction whose
>  ;; predicate is unused.
> -(define_insn "*post_ra_v_ashl<mode>3" > +(define_insn "aarch64_vashl<mode>3_const" > [(set (match_operand:SVE_I 0 "register_operand") > (ashift:SVE_I > (match_operand:SVE_I 1 "register_operand") > (match_operand:SVE_I 2 "aarch64_simd_lshift_imm")))] > - "TARGET_SVE && reload_completed" > + "TARGET_SVE" > {@ [ cons: =0 , 1 , 2 ] > [ w , w , vs1 ] add\t%0.<Vetype>, %1.<Vetype>, %1.<Vetype> > [ w , w , Dl ] lsl\t%0.<Vetype>, %1.<Vetype>, #%2 > } > ) > > -(define_insn "*post_ra_v_<optab><mode>3" > +(define_insn "aarch64_v<optab><mode>3_const" > [(set (match_operand:SVE_I 0 "register_operand" "=w") > (SHIFTRT:SVE_I > (match_operand:SVE_I 1 "register_operand" "w") > (match_operand:SVE_I 2 "aarch64_simd_rshift_imm")))] > - "TARGET_SVE && reload_completed" > + "TARGET_SVE" > "<shift>\t%0.<Vetype>, %1.<Vetype>, #%2" > ) > > diff --git a/gcc/config/aarch64/aarch64-sve2.md > b/gcc/config/aarch64/aarch64-sve2.md > index 871cf0bd2e8..62524f36de6 100644 > --- a/gcc/config/aarch64/aarch64-sve2.md > +++ b/gcc/config/aarch64/aarch64-sve2.md > @@ -1932,40 +1932,27 @@ > (define_expand "@aarch64_sve_add_<sve_int_op><mode>" > [(set (match_operand:SVE_FULL_I 0 "register_operand") > (plus:SVE_FULL_I > - (unspec:SVE_FULL_I > - [(match_dup 4) > - (SHIFTRT:SVE_FULL_I > - (match_operand:SVE_FULL_I 2 "register_operand") > - (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))] > - UNSPEC_PRED_X) > - (match_operand:SVE_FULL_I 1 "register_operand")))] > + (SHIFTRT:SVE_FULL_I > + (match_operand:SVE_FULL_I 2 "register_operand") > + (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")) > + (match_operand:SVE_FULL_I 1 "register_operand")))] > "TARGET_SVE2" > - { > - operands[4] = CONSTM1_RTX (<VPRED>mode); > - } > ) > > ;; Pattern-match SSRA and USRA as a predicated operation whose predicate > ;; isn't needed. > -(define_insn_and_rewrite "*aarch64_sve2_sra<mode>" > +(define_insn "*aarch64_sve2_sra<mode>" > [(set (match_operand:SVE_FULL_I 0 "register_operand") > (plus:SVE_FULL_I > - (unspec:SVE_FULL_I > - [(match_operand 4) > - (SHIFTRT:SVE_FULL_I > - (match_operand:SVE_FULL_I 2 "register_operand") > - (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm"))] > - UNSPEC_PRED_X) > + (SHIFTRT:SVE_FULL_I > + (match_operand:SVE_FULL_I 2 "register_operand") > + (match_operand:SVE_FULL_I 3 "aarch64_simd_rshift_imm")) > (match_operand:SVE_FULL_I 1 "register_operand")))] > "TARGET_SVE2" > {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ] > [ w , 0 , w ; * ] <sra_op>sra\t%0.<Vetype>, > %2.<Vetype>, #%3 > [ ?&w , w , w ; yes ] movprfx\t%0, > %1\;<sra_op>sra\t%0.<Vetype>, %2.<Vetype>, #%3 > } > - "&& !CONSTANT_P (operands[4])" > - { > - operands[4] = CONSTM1_RTX (<VPRED>mode); > - } > ) > > ;; SRSRA and URSRA. 
> @@ -2715,17 +2702,14 @@
>  ;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
>  (define_insn "*bitmask_shift_plus<mode>"
>    [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> -        (unspec:SVE_FULL_HSDI
> -          [(match_operand:<VPRED> 1)
> -           (lshiftrt:SVE_FULL_HSDI
> -             (plus:SVE_FULL_HSDI
> -               (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> -               (match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
> -             (match_operand:SVE_FULL_HSDI 4
> -               "aarch64_simd_shift_imm_vec_exact_top" ""))]
> -          UNSPEC_PRED_X))]
> +        (lshiftrt:SVE_FULL_HSDI
> +          (plus:SVE_FULL_HSDI
> +            (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
> +            (match_operand:SVE_FULL_HSDI 2 "register_operand" "w"))
> +          (match_operand:SVE_FULL_HSDI 3
> +            "aarch64_simd_shift_imm_vec_exact_top" "")))]
>    "TARGET_SVE2"
> -  "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
> +  "addhnb\t%0.<Ventype>, %1.<Vetype>, %2.<Vetype>"
>  )
>
>  ;; -------------------------------------------------------------------------
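For the ADDHNB hunk, the matched form is the narrowing high-half add.  A
minimal illustration on my part (the element widths, flags and the exact
instruction selection are assumptions, not taken from the patch):

  #include <stdint.h>

  /* (a[i] + b[i]) >> 16 on 32-bit elements, stored as 16 bits, is the
     ((a + b) >> n) shape with n equal to half the element width that
     *bitmask_shift_plus<mode> matches, i.e. an ADDHNB candidate when
     vectorized with something like -O3 -march=armv9-a+sve2.  */
  void
  add_narrow_high (uint16_t *restrict dst, const uint32_t *restrict a,
                   const uint32_t *restrict b, int n)
  {
    for (int i = 0; i < n; i++)
      dst[i] = (uint16_t) ((a[i] + b[i]) >> 16);
  }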