https://gcc.gnu.org/g:0417a630811404c2362060b7e15f99e5a4a0d76a
commit r16-703-g0417a630811404c2362060b7e15f99e5a4a0d76a
Author: Pengxuan Zheng <quic_pzh...@quicinc.com>
Date:   Mon May 12 10:12:11 2025 -0700

    aarch64: Optimize AND with certain vector of immediates as FMOV [PR100165]

    We can optimize AND with certain vector of immediates as FMOV if the
    result of the AND is as if the upper lane of the input vector is set to
    zero and the lower lane remains unchanged.

    For example, at present:

    v4hi
    f_v4hi (v4hi x)
    {
      return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
    }

    generates:

    f_v4hi:
            movi    d31, 0xffffffff
            and     v0.8b, v0.8b, v31.8b
            ret

    With this patch, it generates:

    f_v4hi:
            fmov    s0, s0
            ret

    Changes since v1:
    * v2: Simplify the mask checking logic by using native_decode_int and
      address a few other review comments.

            PR target/100165

    gcc/ChangeLog:

            * config/aarch64/aarch64-protos.h (aarch64_output_fmov): New
            prototype.
            (aarch64_simd_valid_and_imm_fmov): Likewise.
            * config/aarch64/aarch64-simd.md (and<mode>3<vczle><vczbe>): Allow
            FMOV codegen.
            * config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New.
            (aarch64_output_fmov): Likewise.
            * config/aarch64/constraints.md (Df): New constraint.
            * config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update
            predicate to support FMOV codegen.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/fmov-1-be.c: New test.
            * gcc.target/aarch64/fmov-1-le.c: New test.
            * gcc.target/aarch64/fmov-2-be.c: New test.
            * gcc.target/aarch64/fmov-2-le.c: New test.

    Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>

Diff:
---
 gcc/config/aarch64/aarch64-protos.h          |   2 +
 gcc/config/aarch64/aarch64-simd.md           |  10 +-
 gcc/config/aarch64/aarch64.cc                |  50 +++++++++
 gcc/config/aarch64/constraints.md            |   7 ++
 gcc/config/aarch64/predicates.md             |   3 +-
 gcc/testsuite/gcc.target/aarch64/fmov-1-be.c | 151 +++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-1-le.c | 151 +++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-2-be.c |  90 ++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/fmov-2-le.c |  90 ++++++++++++++++
 9 files changed, 548 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b59eecf5bdff..8f37e56d440e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned);
 char *aarch64_output_simd_orr_imm (rtx, unsigned);
 char *aarch64_output_simd_and_imm (rtx, unsigned);
 char *aarch64_output_simd_xor_imm (rtx, unsigned);
+char *aarch64_output_fmov (rtx);
 char *aarch64_output_sve_mov_immediate (rtx);
 char *aarch64_output_sve_ptrues (rtx);
 
@@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
 bool aarch64_simd_valid_and_imm (rtx);
+bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL);
 bool aarch64_simd_valid_mov_imm (rtx);
 bool aarch64_simd_valid_orr_imm (rtx);
 bool aarch64_simd_valid_xor_imm (rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 1099e742cbf7..6e30dc48934c 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -1117,17 +1117,17 @@
   [(set_attr "type" "neon_fp_abd_<stype><q>")]
 )
 
-;; For AND (vector, register) and BIC (vector, immediate)
+;; For AND (vector, register), BIC (vector, immediate) and FMOV (register)
 (define_insn "and<mode>3<vczle><vczbe>"
   [(set (match_operand:VDQ_I 0 "register_operand")
	(and:VDQ_I (match_operand:VDQ_I 1 "register_operand")
		   (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))]
   "TARGET_SIMD"
-  {@ [ cons: =0 , 1 , 2  ]
-     [ w        , w , w  ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
-     [ w        , 0 , Db ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
+  {@ [ cons: =0 , 1 , 2  ; attrs: type   ]
+     [ w        , w , w  ; neon_logic<q> ] and\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
+     [ w        , w , Df ; fmov          ] << aarch64_output_fmov (operands[2]);
+     [ w        , 0 , Db ; neon_logic<q> ] << aarch64_output_simd_and_imm (operands[2], <bitsize>);
   }
-  [(set_attr "type" "neon_logic<q>")]
 )
 
 ;; For ORR (vector, register) and ORR (vector, immediate)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 34f9725485d2..1da615c8955a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -23620,6 +23620,36 @@ aarch64_simd_valid_and_imm (rtx op)
   return aarch64_simd_valid_imm (op, NULL, AARCH64_CHECK_AND);
 }
 
+/* Return true if OP is a valid SIMD and immediate which allows the and to be
+   optimized as fmov.  If ELT_BITSIZE is nonnull, use it to return the number of
+   bits to move.  */
+bool
+aarch64_simd_valid_and_imm_fmov (rtx op, unsigned int *elt_bitsize)
+{
+  machine_mode mode = GET_MODE (op);
+  gcc_assert (!aarch64_sve_mode_p (mode));
+
+  auto_vec<target_unit, 16> buffer;
+  unsigned int n_bytes = GET_MODE_SIZE (mode).to_constant ();
+  buffer.reserve (n_bytes);
+
+  bool ok = native_encode_rtx (mode, op, buffer, 0, n_bytes);
+  gcc_assert (ok);
+
+  auto mask = native_decode_int (buffer, 0, n_bytes, n_bytes * BITS_PER_UNIT);
+  int set_bit = wi::exact_log2 (mask + 1);
+  if ((set_bit == 16 && TARGET_SIMD_F16INST)
+      || set_bit == 32
+      || set_bit == 64)
+    {
+      if (elt_bitsize)
+	*elt_bitsize = set_bit;
+      return true;
+    }
+
+  return false;
+}
+
 /* Return true if OP is a valid SIMD xor immediate for SVE.  */
 bool
 aarch64_simd_valid_xor_imm (rtx op)
@@ -25754,6 +25784,26 @@ aarch64_float_const_representable_p (rtx x)
   return aarch64_real_float_const_representable_p (r);
 }
 
+/* Returns the string with the fmov instruction which is equivalent to an and
+   instruction with the SIMD immediate CONST_VECTOR.  */
+char*
+aarch64_output_fmov (rtx const_vector)
+{
+  bool is_valid;
+  static char templ[40];
+  char element_char;
+  unsigned int elt_bitsize;
+
+  is_valid = aarch64_simd_valid_and_imm_fmov (const_vector, &elt_bitsize);
+  gcc_assert (is_valid);
+
+  element_char = sizetochar (elt_bitsize);
+  snprintf (templ, sizeof (templ), "fmov\t%%%c0, %%%c1", element_char,
+	    element_char);
+
+  return templ;
+}
+
 /* Returns the string with the instruction for the SIMD immediate
  * CONST_VECTOR of MODE and WIDTH.  WHICH selects a move, and(bic) or orr.  */
 char*
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index e8321c4d2fbd..e9f69f823a6b 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -466,6 +466,13 @@
   (and (match_code "const_vector")
        (match_test "aarch64_simd_valid_orr_imm (op)")))
 
+(define_constraint "Df"
+  "@internal
+   A constraint that matches a vector of immediates for and which can be
+   optimized as fmov."
+  (and (match_code "const_vector")
+       (match_test "aarch64_simd_valid_and_imm_fmov (op)")))
+
 (define_constraint "Db"
   "@internal
   A constraint that matches vector of immediates for and/bic."
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 1ab1c696c62c..2c6af831eae1 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -123,7 +123,8 @@
 (define_predicate "aarch64_reg_or_and_imm"
   (ior (match_operand 0 "register_operand")
        (and (match_code "const_vector")
-	    (match_test "aarch64_simd_valid_and_imm (op)"))))
+	    (ior (match_test "aarch64_simd_valid_and_imm (op)")
+		 (match_test "aarch64_simd_valid_and_imm_fmov (op)")))))
 
 (define_predicate "aarch64_reg_or_xor_imm"
   (ior (match_operand 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c
new file mode 100644
index 000000000000..4227c677248d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-1-be.c
@@ -0,0 +1,151 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8-a")
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**	fmov	s0, s0
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0, 0, 0xffff, 0xffff };
+}
+
+/*
+** g_v4hi:
+**	movi	d([0-9]+), 0xffff00000000ffff
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0xffff, 0, 0, 0xffff };
+}
+
+/*
+** f_v8hi:
+**	fmov	s0, s0
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0, 0, 0, 0, 0, 0, 0xffff, 0xffff };
+}
+
+/*
+** g_v8hi:
+**	fmov	d0, d0
+**	ret
+*/
+v8hi
+g_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0, 0, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff };
+}
+
+/*
+** f_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return x & (v2si){ 0, 0xffffffff };
+}
+
+/*
+** f_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return x & (v2di){ 0, 0xffffffffffffffff };
+}
+
+/*
+** g_v2di:
+**	fmov	s0, s0
+**	ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return x & (v2di){ 0, 0xffffffff };
+}
+
+/*
+** f_v4si:
+**	fmov	s0, s0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return x & (v4si){ 0, 0, 0, 0xffffffff };
+}
+
+/*
+** h_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return x & (v4si){ 0, 0, 0xffffffff, 0xffffffff };
+}
+
+/*
+** f_v8qi:
+**	fmov	s0, s0
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return x & (v8qi){ 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff };
+}
+
+/*
+** f_v16qi:
+**	fmov	d0, d0
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+		      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+}
+
+/*
+** g_v16qi:
+**	fmov	s0, s0
+**	ret
+*/
+v16qi
+g_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0,
+		      0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c
new file mode 100644
index 000000000000..618702a63dab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-1-le.c
@@ -0,0 +1,151 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8-a")
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef float v2sf __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+/*
+** f_v4hi:
+**	fmov	s0, s0
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0xffff, 0xffff, 0, 0 };
+}
+
+/*
+** g_v4hi:
+**	movi	d([0-9]+), 0xffff00000000ffff
+**	and	v0.8b, (?:v0.8b, v\1.8b|v\1.8b, v0.8b)
+**	ret
+*/
+v4hi
+g_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0xffff, 0, 0, 0xffff };
+}
+
+/*
+** f_v8hi:
+**	fmov	s0, s0
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0xffff, 0xffff, 0, 0, 0, 0, 0, 0 };
+}
+
+/*
+** g_v8hi:
+**	fmov	d0, d0
+**	ret
+*/
+v8hi
+g_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 };
+}
+
+/*
+** f_v2si:
+**	fmov	s0, s0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return x & (v2si){ 0xffffffff, 0 };
+}
+
+/*
+** f_v2di:
+**	fmov	d0, d0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return x & (v2di){ 0xffffffffffffffff, 0 };
+}
+
+/*
+** g_v2di:
+**	fmov	s0, s0
+**	ret
+*/
+v2di
+g_v2di (v2di x)
+{
+  return x & (v2di){ 0xffffffff, 0 };
+}
+
+/*
+** f_v4si:
+**	fmov	s0, s0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return x & (v4si){ 0xffffffff, 0, 0, 0 };
+}
+
+/*
+** h_v4si:
+**	fmov	d0, d0
+**	ret
+*/
+v4si
+h_v4si (v4si x)
+{
+  return x & (v4si){ 0xffffffff, 0xffffffff, 0, 0 };
+}
+
+/*
+** f_v8qi:
+**	fmov	s0, s0
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return x & (v8qi){ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 };
+}
+
+/*
+** f_v16qi:
+**	fmov	d0, d0
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		      0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/*
+** g_v16qi:
+**	fmov	s0, s0
+**	ret
+*/
+v16qi
+g_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+		      0, 0, 0, 0, 0, 0, 0, 0 };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c
new file mode 100644
index 000000000000..1e38066b4cf2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-2-be.c
@@ -0,0 +1,90 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbig-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+/*
+** f_v2di:
+**	fmov	h0, h0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return x & (v2di){ 0, 0xffff };
+}
+
+/*
+** f_v4si:
+**	fmov	h0, h0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return x & (v4si){ 0, 0, 0, 0xffff };
+}
+
+/*
+** f_v2si:
+**	fmov	h0, h0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return x & (v2si){ 0, 0xffff };
+}
+
+/*
+** f_v8hi:
+**	fmov	h0, h0
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0xffff };
+}
+
+/*
+** f_v4hi:
+**	fmov	h0, h0
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0, 0, 0, 0xffff };
+}
+
+/*
+** f_v16qi:
+**	fmov	h0, h0
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff };
+}
+
+/*
+** f_v8qi:
+**	fmov	h0, h0
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return x & (v8qi){ 0, 0, 0, 0, 0, 0, 0xff, 0xff };
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c
new file mode 100644
index 000000000000..7627680a0b22
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-2-le.c
@@ -0,0 +1,90 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mlittle-endian" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+/*
+** f_v2di:
+**	fmov	h0, h0
+**	ret
+*/
+v2di
+f_v2di (v2di x)
+{
+  return x & (v2di){ 0xffff, 0 };
+}
+
+/*
+** f_v4si:
+**	fmov	h0, h0
+**	ret
+*/
+v4si
+f_v4si (v4si x)
+{
+  return x & (v4si){ 0xffff, 0, 0, 0 };
+}
+
+/*
+** f_v2si:
+**	fmov	h0, h0
+**	ret
+*/
+v2si
+f_v2si (v2si x)
+{
+  return x & (v2si){ 0xffff, 0 };
+}
+
+/*
+** f_v8hi:
+**	fmov	h0, h0
+**	ret
+*/
+v8hi
+f_v8hi (v8hi x)
+{
+  return x & (v8hi){ 0xffff, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/*
+** f_v4hi:
+**	fmov	h0, h0
+**	ret
+*/
+v4hi
+f_v4hi (v4hi x)
+{
+  return x & (v4hi){ 0xffff, 0, 0, 0 };
+}
+
+/*
+** f_v16qi:
+**	fmov	h0, h0
+**	ret
+*/
+v16qi
+f_v16qi (v16qi x)
+{
+  return x & (v16qi){ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/*
+** f_v8qi:
+**	fmov	h0, h0
+**	ret
+*/
+v8qi
+f_v8qi (v8qi x)
+{
+  return x & (v8qi){ 0xff, 0xff, 0, 0, 0, 0, 0, 0 };
+}
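
For readers following the mask check in aarch64_simd_valid_and_imm_fmov, the sketch below restates the same idea in plain C outside GCC's wide-int machinery. It is an illustration only, not GCC code: decode_mask, exact_log2_u64 and valid_fmov_mask are invented stand-ins for native_encode_rtx/native_decode_int and wi::exact_log2, the byte order is assumed little-endian, and the mask is capped at 64 bits (GCC's wide ints also handle the 128-bit vector modes without overflow).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for native_encode_rtx + native_decode_int: decode the bytes
   of a vector immediate (little-endian memory order assumed) into one
   integer mask.  Capped at 8 bytes to keep the sketch simple.  */
static uint64_t
decode_mask (const uint8_t *bytes, unsigned n_bytes)
{
  uint64_t mask = 0;
  for (unsigned i = 0; i < n_bytes && i < 8; i++)
    mask |= (uint64_t) bytes[i] << (8 * i);
  return mask;
}

/* Stand-in for wi::exact_log2: the exact log2 of X, or -1 if X is not
   a power of two.  */
static int
exact_log2_u64 (uint64_t x)
{
  if (x == 0 || (x & (x - 1)) != 0)
    return -1;
  int n = 0;
  while ((x >>= 1) != 0)
    n++;
  return n;
}

/* The AND folds to an FMOV when the mask is a run of ones starting at
   bit 0 whose length is a supported FMOV width: 16 bits (only with
   TARGET_SIMD_F16INST, i.e. +fp16), 32 bits or 64 bits.  For such a
   mask, mask + 1 is a power of two and its log2 is the run length.  */
static bool
valid_fmov_mask (uint64_t mask, bool have_fp16, unsigned *elt_bitsize)
{
  /* GCC computes this on wide ints, so the all-ones case cannot
     overflow; with a plain uint64_t we special-case it.  */
  int set_bit = mask == UINT64_MAX ? 64 : exact_log2_u64 (mask + 1);
  if ((set_bit == 16 && have_fp16) || set_bit == 32 || set_bit == 64)
    {
      if (elt_bitsize)
	*elt_bitsize = (unsigned) set_bit;
      return true;
    }
  return false;
}

int
main (void)
{
  /* Bytes of (v4hi){ 0xffff, 0xffff, 0, 0 } in little-endian order:
     the low 32 bits are ones, the rest are zero.  */
  uint8_t imm[8] = { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 };
  unsigned width;
  if (valid_fmov_mask (decode_mask (imm, 8), false, &width))
    /* sizetochar analog: 16 -> 'h', 32 -> 's', 64 -> 'd'.  */
    printf ("fmov\t%c0, %c0\n", "hsd"[width / 32], "hsd"[width / 32]);
  return 0;
}

For the v4hi { 0xffff, 0xffff, 0, 0 } immediate from the commit message, the decoded mask is 0xffffffff and mask + 1 is 1 << 32, so the AND keeps exactly the low 32 bits of the register and the sketch prints fmov s0, s0, matching the code the patch generates.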