Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> We can optimize AND with certain vector of immediates as FMOV if the result
> of the AND is as if the upper lane of the input vector is set to zero and
> the lower lane remains unchanged.
>
> For example, at present:
>
> v4hi
> f_v4hi (v4hi x)
> {
>   return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
> }
>
> generates:
>
> f_v4hi:
>         movi    d31, 0xffffffff
>         and     v0.8b, v0.8b, v31.8b
>         ret
>
> With this patch, it generates:
>
> f_v4hi:
>         fmov    s0, s0
>         ret
>
>         PR target/100165
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-protos.h (aarch64_output_fmov): New prototype.
>         (aarch64_simd_valid_and_imm_fmov): Likewise.
>         * config/aarch64/aarch64-simd.md (and<mode>3<vczle><vczbe>): Allow FMOV
>         codegen.
>         * config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New
>         function.
>         (aarch64_output_fmov): Likewise.
>         * config/aarch64/constraints.md (Df): New constraint.
>         * config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update
>         predicate to support FMOV codegen.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/fmov-1.c: New test.
>         * gcc.target/aarch64/fmov-2.c: New test.
>         * gcc.target/aarch64/fmov-be-1.c: New test.
>         * gcc.target/aarch64/fmov-be-2.c: New test.
Thanks for doing this.  Mostly looks good, but some comments below:

> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-protos.h          |   2 +
>  gcc/config/aarch64/aarch64-simd.md           |  10 +-
>  gcc/config/aarch64/aarch64.cc                |  75 ++++++++++
>  gcc/config/aarch64/constraints.md            |   7 +
>  gcc/config/aarch64/predicates.md             |   3 +-
>  gcc/testsuite/gcc.target/aarch64/fmov-1.c    | 149 +++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-2.c    |  90 +++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-be-1.c | 149 +++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/fmov-be-2.c |  90 +++++++++++
>  9 files changed, 569 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-be-2.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 1ca86c9d175..c461fce8896 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned);
>  char *aarch64_output_simd_orr_imm (rtx, unsigned);
>  char *aarch64_output_simd_and_imm (rtx, unsigned);
>  char *aarch64_output_simd_xor_imm (rtx, unsigned);
> +char *aarch64_output_fmov (rtx);
>
>  char *aarch64_output_sve_mov_immediate (rtx);
>  char *aarch64_output_sve_ptrues (rtx);
> @@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
>  bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
>  bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
>  bool aarch64_simd_valid_and_imm (rtx);
> +bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
>  bool aarch64_simd_valid_mov_imm (rtx);
>  bool aarch64_simd_valid_orr_imm (rtx);
>  bool aarch64_simd_valid_xor_imm (rtx);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e2afe87e513..e051e6459a5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1117,17 +1117,17 @@ (define_insn "fabd<mode>3<vczle><vczbe>"
>    [(set_attr "type" "neon_fp_abd_<stype><q>")]
>  )
>
> -;; For AND (vector, register) and BIC (vector, immediate)
> +;; For AND (vector, register), BIC (vector, immediate) and FMOV (register)
>  (define_insn "and<mode>3<vczle><vczbe>"
>    [(set (match_operand:VDQ_I 0 "register_operand")
>          (and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
>                     (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))]
>    "TARGET_SIMD"
> -  {@ [ cons: =0 , 1 , 2  ]
> -     [ w        , w , w  ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> -     [ w        , 0 , Db ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> +  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
> +     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> +     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
> +     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);

I think the fmov alternative should have priority over the AND immediate
alternative, so that we use fmov even for SVE.
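Something like this is what I have in mind (just a sketch of the ordering,
untested):

  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
  }

Listing Df before Db means that the fmov form wins whenever both constraints
accept the immediate, which matters once SVE makes more AND immediates valid.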
> }
>
> -  [(set_attr "type" "neon_logic<q>")]
>  )
>
>  ;; For ORR (vector, register) and ORR (vector, immediate)
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 38c112cc92f..54895e3f456 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23516,6 +23516,61 @@ aarch64_simd_valid_and_imm (rtx op)
>    return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
>  }
>
> +/* Return true if OP is a valid SIMD and immediate which allows the and be
> +   optimized as fmov.  If ELT_SIZE is nonnull, it represents the size of the
> +   register for fmov.  */
> +bool
> +aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_size)
> +{
> +  machine_mode mode = GET_MODE (op);
> +  gcc_assert (!aarch64_sve_mode_p (mode));
> +
> +  auto_vec<target_unit, 128> buffer;

16 should be enough for Advanced SIMD vectors.

> +  unsigned int n_bytes
> +    = GET_MODE_BITSIZE (mode).to_constant () / BITS_PER_UNIT;

This can be simplified to GET_MODE_SIZE (mode).

> +  buffer.reserve (n_bytes);
> +
> +  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
> +  gcc_assert (ok);
> +
> +  unsigned int n_ones = 0;
> +  unsigned int i = 0;
> +  auto idx = [] (unsigned int n, unsigned int i)
> +  {
> +    return BYTES_BIG_ENDIAN ? n - i - 1 : i;
> +  };
> +
> +  /* Find the number of consecutive 0xFFs at the beginning of the AND mask.  */
> +  for (; i < n_bytes; i++)
> +    {
> +      if (i > 8)
> +        return false;
> +
> +      if (buffer[idx (n_bytes, i)] != 0xff)
> +        break;
> +    }
> +
> +  n_ones = i;
> +  /* Bail out if the AND mask is not valid.  The valid n_ones are 2 (h with
> +     TARGET_SIMD_F16INST), 4 (s) and 8 (d).  */
> +  if ((n_ones != 2 && n_ones != 4 && n_ones != 8)
> +      || (n_ones == 2 && !TARGET_SIMD_F16INST))
> +    return false;
> +
> +  /* The rest of the AND mask must be all 0s.  */
> +  for (; i < n_bytes; i++)
> +    if (buffer[idx (n_bytes, i)] != 0)
> +      break;
> +
> +  if (i != n_bytes)
> +    return false;
> +
> +  if (elt_size)
> +    *elt_size = n_ones;
> +
> +  return true;
> +}
> +

With the native_decode_int patch I submitted here:

  https://gcc.gnu.org/pipermail/gcc-patches/2025-April/682003.html

this could be simplified to:

  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
  gcc_assert (ok);

  auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
  int set_bit = wi::exact_log2 (mask + 1);
  if ((set_bit == 16 && TARGET_SIMD_F16INST)
      || set_bit == 32
      || set_bit == 64)
    {
      if (elt_size)
        *elt_size = set_bit / BITS_PER_UNIT;
      return true;
    }

  return false;
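For reference, the whole function would then reduce to something like this
(untested, and assuming the native_decode_int interface from that patch;
native_decode_int decoding the buffer in target byte order is what lets the
BYTES_BIG_ENDIAN handling above disappear):

  bool
  aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_size)
  {
    machine_mode mode = GET_MODE (op);
    gcc_assert (!aarch64_sve_mode_p (mode));

    /* 16 bytes covers the widest Advanced SIMD mode.  */
    auto_vec<target_unit, 16> buffer;
    unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant ();
    buffer.reserve (n_bytes);

    bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
    gcc_assert (ok);

    /* A usable mask has only its low 16, 32 or 64 bits set, so that
       MASK + 1 is a power of 2.  */
    auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
    int set_bit = wi::exact_log2 (mask + 1);
    if ((set_bit == 16 && TARGET_SIMD_F16INST)
        || set_bit == 32
        || set_bit == 64)
      {
        if (elt_size)
          *elt_size = set_bit / BITS_PER_UNIT;
        return true;
      }

    return false;
  }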
>  /* Return true if OP is a valid SIMD xor immediate for SVE.  */
>  bool
>  aarch64_simd_valid_xor_imm (rtx op)
> @@ -25642,6 +25697,26 @@ aarch64_float_const_representable_p (rtx x)
>    return aarch64_real_float_const_representable_p (r);
>  }
>
> +/* Returns the string with the fmov instruction which is equivalent to an and
> +   instruction with the SIMD immediate CONST_VECTOR.  */
> +char*
> +aarch64_output_fmov (rtx const_vector)
> +{
> +  bool is_valid;
> +  static char templ[40];
> +  char element_char;
> +  unsigned int elt_size;
> +
> +  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_size);
> +  gcc_assert (is_valid);
> +
> +  element_char = sizetochar (elt_size * BITS_PER_UNIT);
> +  snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1",
> +            element_char, element_char);
> +
> +  return templ;
> +}
> +
>  /* Returns the string with the instruction for the SIMD immediate
>   * CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and(bic) or orr.
>   */
>  char*
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index e8321c4d2fb..253603c173a 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -466,6 +466,13 @@ (define_constraint "Do"
>    (and (match_code "const_vector")
>         (match_test "aarch64_simd_valid_orr_imm (op)")))
>
> +(define_constraint "Df"
> +  "@internal
> +   A constraint that matches vector of immediates for and which can be\

"matches a vector"

The trailing backslash shouldn't be necessary.

> +   optimized as fmov."
> +  (and (match_code "const_vector")
> +       (match_test "aarch64_simd_valid_and_imm_fmov (op)")))
> +
>  (define_constraint "Db"
>    "@internal
>    A constraint that matches vector of immediates for and/bic."
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index 1ab1c696c62..2c6af831eae 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -123,7 +123,8 @@ (define_predicate "aarch64_reg_or_orr_imm"
>  (define_predicate "aarch64_reg_or_and_imm"
>    (ior (match_operand 0 "register_operand")
>         (and (match_code "const_vector")
> -            (match_test "aarch64_simd_valid_and_imm (op)"))))
> +            (ior (match_test "aarch64_simd_valid_and_imm (op)")
> +                 (match_test "aarch64_simd_valid_and_imm_fmov (op)")))))
>
>  (define_predicate "aarch64_reg_or_xor_imm"
>    (ior (match_operand 0 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> new file mode 100644
> index 00000000000..d657dfdee88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-1.c
> @@ -0,0 +1,149 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */

Since this test is specific to little-endian, it might be better to call it
fmov-1-le.c.  The options should include -mlittle-endian, so that the tests
still pass on aarch64_be targets.  Same for fmov-2.c.

Thanks,
Richard
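P.S. In case it helps, the kind of test header I had in mind (the fmov-1-le.c
name is only a suggestion):

  /* { dg-do compile } */
  /* { dg-options "-O2 -mlittle-endian" } */
  /* { dg-final { check-function-bodies "**" "" "" } } */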