[gcc r15-3669] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]
https://gcc.gnu.org/g:a92f54f580c37732a5de01e47aed56882231f196 commit r15-3669-ga92f54f580c37732a5de01e47aed56882231f196 Author: Pengxuan Zheng Date: Tue Sep 10 17:59:46 2024 -0700 aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328] SVE's INDEX instruction can be used to populate vectors by values starting from "base" and incremented by "step" for each subsequent value. We can take advantage of it to generate vector constants if TARGET_SVE is available and the base and step values are within [-16, 15]. For example, with the following function: typedef int v4si __attribute__ ((vector_size (16))); v4si f_v4si (void) { return (v4si){ 0, 1, 2, 3 }; } GCC currently generates: f_v4si: adrpx0, .LC4 ldr q0, [x0, #:lo12:.LC4] ret .LC4: .word 0 .word 1 .word 2 .word 3 With this patch, we generate an INDEX instruction instead if TARGET_SVE is available. f_v4si: index z0.s, #0, #1 ret PR target/113328 gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is available. (aarch64_output_simd_mov_immediate): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use SVE's INDEX instruction. * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise. * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise. * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise. * gcc.target/aarch64/sve/vec_init_3.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64.cc | 13 ++- .../gcc.target/aarch64/sve/acle/general/dupq_1.c | 3 +- .../gcc.target/aarch64/sve/acle/general/dupq_2.c | 3 +- .../gcc.target/aarch64/sve/acle/general/dupq_3.c | 3 +- .../gcc.target/aarch64/sve/acle/general/dupq_4.c | 3 +- gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c | 99 ++ 6 files changed, 115 insertions(+), 9 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 6ccf08d1cc0a..92763d403c75 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -22987,7 +22987,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, if (CONST_VECTOR_P (op) && CONST_VECTOR_DUPLICATE_P (op)) n_elts = CONST_VECTOR_NPATTERNS (op); - else if ((vec_flags & VEC_SVE_DATA) + else if (which == AARCH64_CHECK_MOV + && TARGET_SVE && const_vec_series_p (op, &base, &step)) { gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); @@ -25245,6 +25246,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, if (which == AARCH64_CHECK_MOV) { + if (info.insn == simd_immediate_info::INDEX) + { + gcc_assert (TARGET_SVE); + snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #" + HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, + element_char, INTVAL (info.u.index.base), + INTVAL (info.u.index.step)); + return templ; + } + mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; shift_op = (info.u.mov.modifier == simd_immediate_info::MSL ? 
"msl" : "lsl"); diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c index 216699b0536e..0940bedd0ddb 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c @@ -10,7 +10,6 @@ dupq (int x) return svdupq_s32 (x, 1, 2, 3); } -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */ /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */ /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */ -/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c index d494943a2753..218a66013375 100644 --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c @@ -10,7 +10,6 @@ dupq (int x) return svdupq_s32 (x, 1, 2, 3); } -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */ +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */ /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
[gcc r15-1801] aarch64: Add vector popcount besides QImode [PR113859]
https://gcc.gnu.org/g:895bbc08d38c2aca3cbbab273a247021fea73930 commit r15-1801-g895bbc08d38c2aca3cbbab273a247021fea73930 Author: Pengxuan Zheng Date: Wed Jun 12 18:23:13 2024 -0700 aarch64: Add vector popcount besides QImode [PR113859] This patch improves GCC’s vectorization of __builtin_popcount for aarch64 target by adding popcount patterns for vector modes besides QImode, i.e., HImode, SImode and DImode. With this patch, we now generate the following for V8HI: cnt v1.16b, v0.16b uaddlp v2.8h, v1.16b For V4HI, we generate: cnt v1.8b, v0.8b uaddlp v2.4h, v1.8b For V4SI, we generate: cnt v1.16b, v0.16b uaddlp v2.8h, v1.16b uaddlp v3.4s, v2.8h For V4SI with TARGET_DOTPROD, we generate the following instead: moviv0.4s, #0 moviv1.16b, #1 cnt v3.16b, v2.16b udotv0.4s, v3.16b, v1.16b For V2SI, we generate: cnt v1.8b, v.8b uaddlp v2.4h, v1.8b uaddlp v3.2s, v2.4h For V2SI with TARGET_DOTPROD, we generate the following instead: moviv0.8b, #0 moviv1.8b, #1 cnt v3.8b, v2.8b udotv0.2s, v3.8b, v1.8b For V2DI, we generate: cnt v1.16b, v.16b uaddlp v2.8h, v1.16b uaddlp v3.4s, v2.8h uaddlp v4.2d, v3.4s For V4SI with TARGET_DOTPROD, we generate the following instead: moviv0.4s, #0 moviv1.16b, #1 cnt v3.16b, v2.16b udotv0.4s, v3.16b, v1.16b uaddlp v0.2d, v0.4s PR target/113859 gcc/ChangeLog: * config/aarch64/aarch64-simd.md (aarch64_addlp): Rename to... (@aarch64_addlp): ... This. (popcount2): New define_expand. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-udot.c: New test. * gcc.target/aarch64/popcnt-vec.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-simd.md | 41 ++- gcc/testsuite/gcc.target/aarch64/popcnt-udot.c | 58 ++ gcc/testsuite/gcc.target/aarch64/popcnt-vec.c | 69 ++ 3 files changed, 167 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 01b084d8ccb..fd0c5e612b5 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3461,7 +3461,7 @@ [(set_attr "type" "neon_reduc_add")] ) -(define_expand "aarch64_addlp" +(define_expand "@aarch64_addlp" [(set (match_operand: 0 "register_operand") (plus: (vec_select: @@ -3517,6 +3517,45 @@ [(set_attr "type" "neon_cnt")] ) +(define_expand "popcount2" + [(set (match_operand:VDQHSD 0 "register_operand") + (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))] + "TARGET_SIMD" + { +/* Generate a byte popcount. */ +machine_mode mode = == 64 ? V8QImode : V16QImode; +rtx tmp = gen_reg_rtx (mode); +auto icode = optab_handler (popcount_optab, mode); +emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1]))); + +if (TARGET_DOTPROD + && (mode == SImode || mode == DImode)) + { + /* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a + 1 multiplicand. For V2DI, another UAADDLP is needed. */ + rtx ones = force_reg (mode, CONST1_RTX (mode)); + auto icode = optab_handler (udot_prod_optab, mode); + mode = == 64 ? V2SImode : V4SImode; + rtx dest = mode == mode ? operands[0] : gen_reg_rtx (mode); + rtx zeros = force_reg (mode, CONST0_RTX (mode)); + emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros)); + tmp = dest; + } + +/* Use a sequence of UADDLPs to accumulate the counts. Each step doubles + the element size and halves the number of elements. */ +while (mode != mode) + { + auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp)); + mode = insn_data[icode].operand[0].mode; + rtx dest = mode == mode ? 
operands[0] : gen_reg_rtx (mode); + emit_insn (GEN_FCN (icode) (dest, tmp)); + tmp = dest; + } +DONE; + } +) + ;; 'across lanes' max and min ops. ;; Template for outputting a scalar, so we can create __builtins which can be diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c new file mode 100644 index 000..f6a968dae95 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */ + +/* +** bar: +** moviv([0-9]+).16b, 0x1 +** moviv([0-9]+).4s, 0 +** ldr q([0-9]+), \[x0\] +** cnt v([0-9]+).16b, v\3.16b +** udotv\2.4s, v\4.16b, v\
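The new expander is what lets the vectorizer handle popcount loops over wider-than-QImode elements in the first place. A minimal sketch of such a loop (function and parameter names are ours, not taken from the patch's testsuite):

/* With the popcount patterns for HImode/SImode/DImode vector elements,
   a loop like this can be vectorized into the CNT + UADDLP sequences
   shown above (or CNT + UDOT when TARGET_DOTPROD is available).  */
void
popcount_si (unsigned int *restrict dst, unsigned int *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = __builtin_popcount (src[i]);
}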
[gcc r15-2659] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]
https://gcc.gnu.org/g:e4b8db26de35239bd621aad9c0361f25d957122b commit r15-2659-ge4b8db26de35239bd621aad9c0361f25d957122b Author: Pengxuan Zheng Date: Wed Jul 31 17:00:01 2024 -0700 aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860] This patch improves the Advanced SIMD popcount expansion by using SVE if available. For example, GCC currently generates the following code sequence for V2DI: cnt v31.16b, v31.16b uaddlp v31.8h, v31.16b uaddlp v31.4s, v31.8h uaddlp v31.2d, v31.4s However, by using SVE, we can generate the following sequence instead: ptrue p7.b, all cnt z31.d, p7/m, z31.d Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too. The scalar popcount expansion can also be improved similarly by using SVE and those changes will be included in a separate patch. PR target/113860 gcc/ChangeLog: * config/aarch64/aarch64-simd.md (popcount2): Add TARGET_SVE support. * config/aarch64/aarch64-sve.md (@aarch64_pred_): Use new iterator SVE_VDQ_I. * config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator. (VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-sve.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-simd.md| 9 +++ gcc/config/aarch64/aarch64-sve.md | 13 ++-- gcc/config/aarch64/iterators.md | 5 ++ gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++ 4 files changed, 109 insertions(+), 6 deletions(-) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 459e11b09a19..816f499e9634 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3508,6 +3508,15 @@ (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))] "TARGET_SIMD" { +if (TARGET_SVE) + { + rtx p = aarch64_ptrue_reg (mode); + emit_insn (gen_aarch64_pred_popcount (operands[0], + p, + operands[1])); + DONE; + } + /* Generate a byte popcount. */ machine_mode mode = == 64 ? V8QImode : V16QImode; rtx tmp = gen_reg_rtx (mode); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index c3ed5075c4ed..a5cd42be9d5c 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3104,16 +3104,16 @@ ;; Integer unary arithmetic predicated with a PTRUE. (define_insn "@aarch64_pred_" - [(set (match_operand:SVE_I 0 "register_operand") - (unspec:SVE_I + [(set (match_operand:SVE_VDQ_I 0 "register_operand") + (unspec:SVE_VDQ_I [(match_operand: 1 "register_operand") - (SVE_INT_UNARY:SVE_I -(match_operand:SVE_I 2 "register_operand"))] + (SVE_INT_UNARY:SVE_VDQ_I +(match_operand:SVE_VDQ_I 2 "register_operand"))] UNSPEC_PRED_X))] "TARGET_SVE" {@ [ cons: =0 , 1 , 2 ; attrs: movprfx ] - [ w, Upl , 0 ; * ] \t%0., %1/m, %2. - [ ?&w , Upl , w ; yes] movprfx\t%0, %2\;\t%0., %1/m, %2. + [ w, Upl , 0 ; * ] \t%Z0., %1/m, %Z2. + [ ?&w , Upl , w ; yes] movprfx\t%Z0, %Z2\;\t%Z0., %1/m, %Z2. } ) @@ -3168,6 +3168,7 @@ } ) + ;; - ;; [INT] General unary arithmetic corresponding to unspecs ;; - diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 95fe8f070f4c..aaa4afefe2ce 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -559,6 +559,9 @@ ;; element modes (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI]) +;; All SVE and Advanced SIMD integer vector modes. +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I]) + ;; SVE integer vector modes whose elements are 16 bits or wider. 
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI VNx4SI VNx2SI @@ -2278,6 +2281,8 @@ (VNx32BF "VNx8BI") (VNx16SI "VNx4BI") (VNx16SF "VNx4BI") (VNx8DI "VNx2BI") (VNx8DF "VNx2BI") +(V8QI "VNx8BI") (V16QI "VNx16BI") +(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI") (V4SI "VNx4BI") (V2DI "VNx2BI")]) ;; ...and again in lower case. diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c new file mode 100644 index ..8e349efe3907 --- /dev/null +
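A 64-bit-element variant (illustrative only; the names below are ours) shows where the SVE form helps most, since the plain Advanced SIMD expansion for V2DI needs the full CNT plus three UADDLPs quoted above:

/* With TARGET_SVE, each vector of two 64-bit elements is expected to be
   counted with a predicated SVE CNT ("ptrue p7.b, all" +
   "cnt z31.d, p7/m, z31.d") rather than CNT + UADDLP + UADDLP + UADDLP.  */
void
popcount_di (unsigned long long *restrict dst,
             const unsigned long long *restrict src, int n)
{
  for (int i = 0; i < n; i++)
    dst[i] = __builtin_popcountll (src[i]);
}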
[gcc r15-949] MAINTAINERS: Add myself to Write After Approval and DCO
https://gcc.gnu.org/g:96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7

commit r15-949-g96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7
Author: Pengxuan Zheng
Date:   Fri May 31 11:07:05 2024 -0700

    MAINTAINERS: Add myself to Write After Approval and DCO

    ChangeLog:

            * MAINTAINERS: Add myself to Write After Approval and DCO.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e2870eef2ef..6444e6ea2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,7 @@ Dennis Zhang
 Yufeng Zhang
 Qing Zhao
 Shujing Zhao
+Pengxuan Zheng
 Jon Ziegler
 Roman Zippel
 Josef Zlomek
@@ -789,3 +790,4 @@ Martin Uecker
 Jonathan Wakely
 Alexander Westbrooks
 Chung-Ju Wu
+Pengxuan Zheng
[gcc r15-950] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c
https://gcc.gnu.org/g:7fb62627cfb3e03811bb667fa7159bbc7f972f00

commit r15-950-g7fb62627cfb3e03811bb667fa7159bbc7f972f00
Author: Pengxuan Zheng
Date:   Wed May 22 17:38:43 2024 -0700

    aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

    vget_low_2.c is a test case for little-endian, but we missed the
    -mlittle-endian flag in r15-697-ga2e4fe5a53cf75.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
index 44414e1c043..93e9e664ee9 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */
 #include
[gcc r15-1079] aarch64: Add vector floating point extend pattern [PR113880, PR113869]
https://gcc.gnu.org/g:230d62a2cdd16c1ec8fe87998ec01081503f010d commit r15-1079-g230d62a2cdd16c1ec8fe87998ec01081503f010d Author: Pengxuan Zheng Date: Thu May 30 17:53:23 2024 -0700 aarch64: Add vector floating point extend pattern [PR113880, PR113869] This patch adds vector floating point extend pattern for V2SF->V2DF and V4HF->V4SF conversions by renaming the existing aarch64_float_extend_lo_ pattern to the standard optab one, i.e., extend2. This allows the vectorizer to vectorize certain floating point widening operations for the aarch64 target. PR target/113880 PR target/113869 gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_ builtin codes to standard optab ones. * config/aarch64/aarch64-simd.md (aarch64_float_extend_lo_): Rename to... (extend2): ... This. gcc/testsuite/ChangeLog: * gcc.target/aarch64/extend-vec.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-builtins.cc| 9 + gcc/config/aarch64/aarch64-simd.md| 2 +- gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 + 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index f8eeccb554d..25189888d17 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0) BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0) BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0) +/* The builtins below should be expanded through the standard optabs + CODE_FOR_extend2. */ +#undef VAR1 +#define VAR1(F,T,N,M) \ + constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2; + +VAR1 (float_extend_lo_, extend, v2sf, v2df) +VAR1 (float_extend_lo_, extend, v4hf, v4sf) + #undef VAR1 #define VAR1(T, N, MAP, FLAG, A) \ {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 868f4486218..c5e2c9f00d0 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3132,7 +3132,7 @@ DONE; } ) -(define_insn "aarch64_float_extend_lo_" +(define_insn "extend2" [(set (match_operand: 0 "register_operand" "=w") (float_extend: (match_operand:VDF 1 "register_operand" "w")))] diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c b/gcc/testsuite/gcc.target/aarch64/extend-vec.c new file mode 100644 index 000..f6241d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */ +void +f (float *__restrict a, double *__restrict b) +{ + b[0] = a[0]; + b[1] = a[1]; +} + +/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */ +void +f1 (_Float16 *__restrict a, float *__restrict b) +{ + + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; +}
[gcc r15-1182] aarch64: Add vector floating point trunc pattern
https://gcc.gnu.org/g:e7cd8ea1fa3e48404954bb7c06e9bcd603f132dd commit r15-1182-ge7cd8ea1fa3e48404954bb7c06e9bcd603f132dd Author: Pengxuan Zheng Date: Fri Jun 7 19:52:00 2024 -0700 aarch64: Add vector floating point trunc pattern This patch is a follow-up of r15-1079-g230d62a2cdd16c to add vector floating point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions by renaming the existing aarch64_float_truncate_lo_ pattern to the standard optab one, i.e., trunc2. This allows the vectorizer to vectorize certain floating point narrowing operations for the aarch64 target. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (VAR1): Remap float_truncate_lo_ builtin codes to standard optab ones. * config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_): Rename to... (trunc2): ... This. gcc/testsuite/ChangeLog: * gcc.target/aarch64/trunc-vec.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-builtins.cc | 7 +++ gcc/config/aarch64/aarch64-simd.md | 6 +++--- gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21 + 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 25189888d17d..d589e59defc2 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0) VAR1 (float_extend_lo_, extend, v2sf, v2df) VAR1 (float_extend_lo_, extend, v4hf, v4sf) +/* __builtin_aarch64_float_truncate_lo_ should be expanded through the + standard optabs CODE_FOR_trunc2. */ +constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf += CODE_FOR_truncv4sfv4hf2; +constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf += CODE_FOR_truncv2dfv2sf2; + #undef VAR1 #define VAR1(T, N, MAP, FLAG, A) \ {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG}, diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index c5e2c9f00d02..f644bd1731e5 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3197,7 +3197,7 @@ } ) -(define_insn "aarch64_float_truncate_lo_" +(define_insn "trunc2" [(set (match_operand:VDF 0 "register_operand" "=w") (float_truncate:VDF (match_operand: 1 "register_operand" "w")))] @@ -3256,7 +3256,7 @@ int lo = BYTES_BIG_ENDIAN ? 2 : 1; int hi = BYTES_BIG_ENDIAN ? 1 : 2; -emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo])); +emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo])); emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0], tmp, operands[hi])); DONE; @@ -3272,7 +3272,7 @@ { rtx tmp = gen_reg_rtx (V2SFmode); emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2])); -emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp)); +emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp)); DONE; } ) diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c new file mode 100644 index ..05e8af7912de --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d} 1 } } */ +void +f (double *__restrict a, float *__restrict b) +{ + b[0] = a[0]; + b[1] = a[1]; +} + +/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */ +void +f1 (float *__restrict a, _Float16 *__restrict b) +{ + + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; +}
[gcc r15-4579] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]
https://gcc.gnu.org/g:9ffcf1f193b477f417a4c1960cd32696a23b99b4 commit r15-4579-g9ffcf1f193b477f417a4c1960cd32696a23b99b4 Author: Pengxuan Zheng Date: Mon Oct 14 05:37:49 2024 -0700 aarch64: Improve scalar mode popcount expansion by using SVE [PR113860] This is similar to the recent improvements to the Advanced SIMD popcount expansion by using SVE. We can utilize SVE to generate more efficient code for scalar mode popcount too. Changes since v1: * v2: Add a new VNx1BI mode and a new test case for V1DI. * v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg. PR target/113860 gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function. * config/aarch64/aarch64-simd.md (popcount2): Update pattern to also support V1DI mode. * config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function. * config/aarch64/aarch64.md (popcount2): Add TARGET_SVE support. * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator. (SVE_VDQ_I): Add V1DI. (bitsize): Likewise. (VPRED): Likewise. (VEC_POP_MODE): New mode attribute. (vec_pop_mode): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/popcnt-sve.c: Update test. * gcc.target/aarch64/popcnt11.c: New test. * gcc.target/aarch64/popcnt12.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md| 15 +-- gcc/config/aarch64/aarch64.cc | 21 ++ gcc/config/aarch64/aarch64.md | 9 + gcc/config/aarch64/iterators.md | 16 ++-- gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 10 ++--- gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 +++ gcc/testsuite/gcc.target/aarch64/popcnt12.c | 20 + 8 files changed, 139 insertions(+), 11 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 06aa0aac0df6..75f30a52e617 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx); void aarch64_expand_mov_immediate (rtx, rtx); rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type); rtx aarch64_ptrue_reg (machine_mode); +rtx aarch64_ptrue_reg (machine_mode, unsigned int); rtx aarch64_pfalse_reg (machine_mode); bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *); void aarch64_emit_sve_pred_move (rtx, rtx, rtx); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 04851524fdea..68839246fd8a 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -3516,19 +3516,28 @@ ) (define_expand "popcount2" - [(set (match_operand:VDQHSD 0 "register_operand") - (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))] + [(set (match_operand:VDQHSD_V1DI 0 "register_operand") + (popcount:VDQHSD_V1DI + (match_operand:VDQHSD_V1DI 1 "register_operand")))] "TARGET_SIMD" { if (TARGET_SVE) { - rtx p = aarch64_ptrue_reg (mode); + rtx p = aarch64_ptrue_reg (mode, == 64 ? 8 : 16); emit_insn (gen_aarch64_pred_popcount (operands[0], p, operands[1])); DONE; } +if (mode == V1DImode) + { + rtx out = gen_reg_rtx (DImode); + emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1]))); + emit_move_insn (operands[0], gen_lowpart (mode, out)); + DONE; + } + /* Generate a byte popcount. */ machine_mode mode = == 64 ? V8QImode : V16QImode; machine_mode mode2 = == 64 ? 
V2SImode : V4SImode; diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 3e1d67431566..e6d957d275d1 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode) return gen_lowpart (mode, reg); } +/* Return an all-true (restricted to the leading VL bits) predicate register of + mode MODE. */ + +rtx +aarch64_ptrue_reg (machine_mode mode, unsigned int vl) +{ + gcc_assert (aarch64_sve_pred_mode_p (mode)); + + rtx_vector_builder builder (VNx16BImode, vl, 2); + + for (int i = 0; i < vl; i++) +builder.quick_push (CONST1_RTX (BImode)); + + for (int i = 0; i < vl; i++) +builder.quick_push (CONST0_RTX (BImode)); + + rtx const_vec = builder.build (); + rtx reg = force_reg (VNx16BImode, const_vec); + return gen_lowpart (mode, reg); +} + /* Return an all-false predicate register of mode MODE. */ rtx diff --git a/gcc/config/aarch64/aarch64.
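For the scalar case itself, a minimal example of the kind of function this patch targets (our own sketch, not copied from popcnt11.c):

/* With TARGET_SVE, a DImode popcount can be done by moving X into a vector
   register and using a single predicated SVE CNT on one 64-bit lane,
   instead of the Advanced SIMD byte-wise CNT followed by an ADDV
   reduction.  */
int
popcount_u64 (unsigned long long x)
{
  return __builtin_popcountll (x);
}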
[gcc r16-459] Canonicalize vec_merge in simplify_ternary_operation
https://gcc.gnu.org/g:9b13bea07706a7cae0185f8a860d67209308c050 commit r16-459-g9b13bea07706a7cae0185f8a860d67209308c050 Author: Pengxuan Zheng Date: Thu Feb 6 16:16:32 2025 -0800 Canonicalize vec_merge in simplify_ternary_operation Similar to the canonicalization done in combine, we canonicalize vec_merge with swap_communattive_operands_p in simplify_ternary_operation too. gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_exact_log2_inverse): New. * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero): Update pattern accordingly. * config/aarch64/aarch64.cc (aarch64_exact_log2_inverse): New. * simplify-rtx.cc (simplify_context::simplify_ternary_operation): Canonicalize vec_merge. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64-simd.md | 10 ++ gcc/config/aarch64/aarch64.cc | 10 ++ gcc/simplify-rtx.cc | 7 +++ 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index c83c35c6d71e..c935e7bcf33d 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -1055,6 +1055,7 @@ void aarch64_subvti_scratch_regs (rtx, rtx, rtx *, rtx *, rtx *, rtx *); void aarch64_expand_subvti (rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); +int aarch64_exact_log2_inverse (unsigned int, rtx); /* Initialize builtins for SIMD intrinsics. */ diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index e2afe87e5130..1099e742cbf7 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1193,12 +1193,14 @@ (define_insn "aarch64_simd_vec_set_zero" [(set (match_operand:VALL_F16 0 "register_operand" "=w") (vec_merge:VALL_F16 - (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "") - (match_operand:VALL_F16 3 "register_operand" "0") + (match_operand:VALL_F16 1 "register_operand" "0") + (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "") (match_operand:SI 2 "immediate_operand" "i")))] - "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0" + "TARGET_SIMD && aarch64_exact_log2_inverse (, operands[2]) >= 0" { -int elt = ENDIAN_LANE_N (, exact_log2 (INTVAL (operands[2]))); +int elt = ENDIAN_LANE_N (, +aarch64_exact_log2_inverse (, +operands[2])); operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt); return "ins\\t%0.[%p2], zr"; } diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 2dc5f4c4b59d..9e3f2885bccb 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23914,6 +23914,16 @@ aarch64_strided_registers_p (rtx *operands, unsigned int num_operands, return true; } +/* Return the base 2 logarithm of the bit inverse of OP masked by the lowest + NELTS bits, if OP is a power of 2. Otherwise, returns -1. */ + +int +aarch64_exact_log2_inverse (unsigned int nelts, rtx op) +{ + return exact_log2 ((~INTVAL (op)) +& ((HOST_WIDE_INT_1U << nelts) - 1)); +} + /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and HIGH (exclusive). */ void diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index 7bcbe11370fa..b34fd2f4b9ea 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -7387,6 +7387,13 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode, return gen_rtx_CONST_VECTOR (mode, v); } + if (swap_commutative_operands_p (op0, op1) + /* Two operands have same precedence, then first bit of mask +select first operand. 
*/ + || (!swap_commutative_operands_p (op1, op0) && !(sel & 1))) + return simplify_gen_ternary (code, mode, mode, op1, op0, +GEN_INT (~sel & mask)); + /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c n) if no element from a appears in the result. */ if (GET_CODE (op0) == VEC_MERGE)
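A concrete case that the retargeted aarch64_simd_vec_set_zero pattern covers (our illustration; the pattern itself predates this change and only its operand order is adjusted here):

typedef int v4si __attribute__ ((vector_size (16)));

/* Zeroing one lane is represented as a vec_merge of the input with a zero
   vector; after the canonicalization this is expected to match the updated
   pattern and emit "ins v0.s[1], wzr".  */
v4si
zero_lane_1 (v4si x)
{
  x[1] = 0;
  return x;
}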
[gcc r16-701] aarch64: Fix an oversight in aarch64_evpc_reencode
https://gcc.gnu.org/g:d77c3bc1c35e3032b91648dbef4e0ef1f6020017

commit r16-701-gd77c3bc1c35e3032b91648dbef4e0ef1f6020017
Author: Pengxuan Zheng
Date:   Thu May 15 17:52:29 2025 -0700

    aarch64: Fix an oversight in aarch64_evpc_reencode

    Some fields (e.g., zero_op0_p and zero_op1_p) of the struct "newd" may be
    left uninitialized in aarch64_evpc_reencode. This can cause reading of
    uninitialized data. I found this oversight when testing my patches on
    and/fmov optimizations. This patch fixes the bug by zero initializing the
    struct.

    Pushed as obvious after bootstrap/test on aarch64-linux-gnu.

    gcc/ChangeLog:

            * config/aarch64/aarch64.cc (aarch64_evpc_reencode): Zero initialize
            newd.

Diff:
---
 gcc/config/aarch64/aarch64.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 901aa6ea68a8..f5552e4b86ce 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26277,7 +26277,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
-  expand_vec_perm_d newd;
+  expand_vec_perm_d newd = {};
 
   /* The subregs that we'd create are not supported for big-endian SVE;
      see aarch64_modes_compatible_p for details. */
[gcc r16-702] aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165]
https://gcc.gnu.org/g:dc501cb0dc857663f7fa762f3dbf0ae60973d2c3 commit r16-702-gdc501cb0dc857663f7fa762f3dbf0ae60973d2c3 Author: Pengxuan Zheng Date: Wed May 7 10:47:37 2025 -0700 aarch64: Recognize vector permute patterns which can be interpreted as AND [PR100165] Certain permute that blends a vector with zero can be interpreted as an AND of a mask. This idea was suggested by Richard Sandiford when he was reviewing my patch which tries to optimizes certain vector permute with the FMOV instruction for the aarch64 target. For example, for the aarch64 target, at present: v4hi f_v4hi (v4hi x) { return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 1, 6, 3 }); } generates: f_v4hi: uzp1v0.2d, v0.2d, v0.2d adrpx0, .LC0 ldr d31, [x0, #:lo12:.LC0] tbl v0.8b, {v0.16b}, v31.8b ret .LC0: .byte -1 .byte -1 .byte 2 .byte 3 .byte -1 .byte -1 .byte 6 .byte 7 With this patch, it generates: f_v4hi: mvniv31.2s, 0xff, msl 8 and v0.8b, v0.8b, v31.8b ret This patch also provides a target-independent routine for detecting vector permute patterns which can be interpreted as AND. Changes since v1: * v2: Rework the patch to only perform the optimization for aarch64 by calling the target independent routine vec_perm_and_mask. PR target/100165 gcc/ChangeLog: * config/aarch64/aarch64.cc (aarch64_evpc_and): New. (aarch64_expand_vec_perm_const_1): Call aarch64_evpc_and. * optabs.cc (vec_perm_and_mask): New. * optabs.h (vec_perm_and_mask): New prototype. gcc/testsuite/ChangeLog: * gcc.target/aarch64/and-be.c: New test. * gcc.target/aarch64/and-le.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64.cc | 36 + gcc/optabs.cc | 44 +++ gcc/optabs.h | 4 + gcc/testsuite/gcc.target/aarch64/and-be.c | 123 ++ gcc/testsuite/gcc.target/aarch64/and-le.c | 123 ++ 5 files changed, 330 insertions(+) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f5552e4b86ce..34f9725485d2 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -26886,6 +26886,40 @@ aarch64_evpc_ins (struct expand_vec_perm_d *d) return true; } +/* Recognize patterns suitable for the AND instructions. */ +static bool +aarch64_evpc_and (struct expand_vec_perm_d *d) +{ + /* Either d->op0 or d->op1 should be a vector of all zeros. */ + if (d->one_vector_p || (!d->zero_op0_p && !d->zero_op1_p)) +return false; + + machine_mode mode = d->vmode; + machine_mode sel_mode; + if (!related_int_vector_mode (mode).exists (&sel_mode)) +return false; + + insn_code and_code = optab_handler (and_optab, sel_mode); + rtx and_mask = vec_perm_and_mask (sel_mode, d->perm, d->zero_op0_p); + if (and_code == CODE_FOR_nothing || !and_mask) +return false; + + if (d->testing_p) +return true; + + class expand_operand ops[3]; + rtx in = d->zero_op0_p ? 
d->op1 : d->op0; + create_output_operand (&ops[0], gen_lowpart (sel_mode, d->target), sel_mode); + create_input_operand (&ops[1], gen_lowpart (sel_mode, in), sel_mode); + create_input_operand (&ops[2], and_mask, sel_mode); + expand_insn (and_code, 3, ops); + rtx result = gen_lowpart (mode, ops[0].value); + if (!rtx_equal_p (d->target, result)) +emit_move_insn (d->target, result); + + return true; +} + static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { @@ -26921,6 +26955,8 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; else if (aarch64_evpc_uzp (d)) return true; + else if (aarch64_evpc_and (d)) + return true; else if (aarch64_evpc_trn (d)) return true; else if (aarch64_evpc_sel (d)) diff --git a/gcc/optabs.cc b/gcc/optabs.cc index 92d6d50d55a0..5c9450f61450 100644 --- a/gcc/optabs.cc +++ b/gcc/optabs.cc @@ -6362,6 +6362,50 @@ expand_vec_perm_1 (enum insn_code icode, rtx target, return NULL_RTX; } +/* Check if vec_perm mask SEL is a constant equivalent to an and operation of + the non-zero vec_perm operand with some mask consisting of 0xffs and 0x00s, + assuming the other vec_perm operand is a constant vector of zeros. Return + the mask for the equivalent and operation, or NULL_RTX if the vec_perm can + not be modeled as an and. MODE is the mode of the value being anded. + ZERO_OP0_P is true if the first operand of the vec_perm is a con
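Another shuffle of the same shape, this time with 32-bit elements (our example, not from the and-le.c/and-be.c tests), that falls into the new aarch64_evpc_and path:

typedef int v4si __attribute__ ((vector_size (16)));

/* Lanes 0 and 2 come from the zero operand and lanes 1 and 3 from X, so
   the permute is equivalent to ANDing X with a { 0, -1, 0, -1 } mask and
   needs no TBL or literal-pool index.  */
v4si
blend_zero_v4si (v4si x)
{
  return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 4, 1, 6, 3 });
}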
[gcc r16-703] aarch64: Optimize AND with certain vector of immediates as FMOV [PR100165]
https://gcc.gnu.org/g:0417a630811404c2362060b7e15f99e5a4a0d76a commit r16-703-g0417a630811404c2362060b7e15f99e5a4a0d76a Author: Pengxuan Zheng Date: Mon May 12 10:12:11 2025 -0700 aarch64: Optimize AND with certain vector of immediates as FMOV [PR100165] We can optimize AND with certain vector of immediates as FMOV if the result of the AND is as if the upper lane of the input vector is set to zero and the lower lane remains unchanged. For example, at present: v4hi f_v4hi (v4hi x) { return x & (v4hi){ 0x, 0x, 0, 0 }; } generates: f_v4hi: movid31, 0x and v0.8b, v0.8b, v31.8b ret With this patch, it generates: f_v4hi: fmovs0, s0 ret Changes since v1: * v2: Simplify the mask checking logic by using native_decode_int and address a few other review comments. PR target/100165 gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_output_fmov): New prototype. (aarch64_simd_valid_and_imm_fmov): Likewise. * config/aarch64/aarch64-simd.md (and3): Allow FMOV codegen. * config/aarch64/aarch64.cc (aarch64_simd_valid_and_imm_fmov): New. (aarch64_output_fmov): Likewise. * config/aarch64/constraints.md (Df): New constraint. * config/aarch64/predicates.md (aarch64_reg_or_and_imm): Update predicate to support FMOV codegen. gcc/testsuite/ChangeLog: * gcc.target/aarch64/fmov-1-be.c: New test. * gcc.target/aarch64/fmov-1-le.c: New test. * gcc.target/aarch64/fmov-2-be.c: New test. * gcc.target/aarch64/fmov-2-le.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/config/aarch64/aarch64-protos.h | 2 + gcc/config/aarch64/aarch64-simd.md | 10 +- gcc/config/aarch64/aarch64.cc| 50 + gcc/config/aarch64/constraints.md| 7 ++ gcc/config/aarch64/predicates.md | 3 +- gcc/testsuite/gcc.target/aarch64/fmov-1-be.c | 151 +++ gcc/testsuite/gcc.target/aarch64/fmov-1-le.c | 151 +++ gcc/testsuite/gcc.target/aarch64/fmov-2-be.c | 90 gcc/testsuite/gcc.target/aarch64/fmov-2-le.c | 90 9 files changed, 548 insertions(+), 6 deletions(-) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index b59eecf5bdff..8f37e56d440e 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -933,6 +933,7 @@ char *aarch64_output_simd_mov_imm (rtx, unsigned); char *aarch64_output_simd_orr_imm (rtx, unsigned); char *aarch64_output_simd_and_imm (rtx, unsigned); char *aarch64_output_simd_xor_imm (rtx, unsigned); +char *aarch64_output_fmov (rtx); char *aarch64_output_sve_mov_immediate (rtx); char *aarch64_output_sve_ptrues (rtx); @@ -948,6 +949,7 @@ bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode); bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool); bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *); bool aarch64_simd_valid_and_imm (rtx); +bool aarch64_simd_valid_and_imm_fmov (rtx, unsigned int * = NULL); bool aarch64_simd_valid_mov_imm (rtx); bool aarch64_simd_valid_orr_imm (rtx); bool aarch64_simd_valid_xor_imm (rtx); diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 1099e742cbf7..6e30dc48934c 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -1117,17 +1117,17 @@ [(set_attr "type" "neon_fp_abd_")] ) -;; For AND (vector, register) and BIC (vector, immediate) +;; For AND (vector, register), BIC (vector, immediate) and FMOV (register) (define_insn "and3" [(set (match_operand:VDQ_I 0 "register_operand") (and:VDQ_I (match_operand:VDQ_I 1 "register_operand") (match_operand:VDQ_I 2 "aarch64_reg_or_and_imm")))] "TARGET_SIMD" - {@ [ cons: =0 , 1 , 2 ] - [ 
w, w , w ] and\t%0., %1., %2. - [ w, 0 , Db ] << aarch64_output_simd_and_imm (operands[2], ); + {@ [ cons: =0 , 1 , 2 ; attrs: type ] + [ w, w , w ; neon_logic ] and\t%0., %1., %2. + [ w, w , Df ; fmov ] << aarch64_output_fmov (operands[2]); + [ w, 0 , Db ; neon_logic ] << aarch64_output_simd_and_imm (operands[2], ); } - [(set_attr "type" "neon_logic")] ) ;; For ORR (vector, register) and ORR (vector, immediate) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 34f9725485d2..1da615c8955a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -23620,6 +23620,36 @@ aarch64_simd_valid_and_imm (rtx op) return aarch64_simd_valid_imm (op, NULL,
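The same idea applies to wider elements; for example (our sketch, in the spirit of the fmov-1-le.c test), keeping only the low 64 bits of a 128-bit vector:

typedef int v4si __attribute__ ((vector_size (16)));

/* The mask clears the upper half and keeps the lower half unchanged, so on
   little-endian this AND is expected to collapse to a single
   "fmov d0, d0".  */
v4si
keep_low_v4si (v4si x)
{
  return x & (v4si){ -1, -1, 0, 0 };
}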
[gcc r16-704] aarch64: Add more vector permute tests for the FMOV optimization [PR100165]
https://gcc.gnu.org/g:265fdb3fa91346f1be40111a9f3e8a0838f7d7fd commit r16-704-g265fdb3fa91346f1be40111a9f3e8a0838f7d7fd Author: Pengxuan Zheng Date: Mon May 12 10:21:49 2025 -0700 aarch64: Add more vector permute tests for the FMOV optimization [PR100165] This patch adds more tests for vector permutes which can now be optimized as FMOV with the generic PERM change and the aarch64 AND patch. Changes since v1: * v2: Add -mlittle-endian to the little endian tests explicitly and rename the tests accordingly. PR target/100165 gcc/testsuite/ChangeLog: * gcc.target/aarch64/fmov-3-be.c: New test. * gcc.target/aarch64/fmov-3-le.c: New test. * gcc.target/aarch64/fmov-4-be.c: New test. * gcc.target/aarch64/fmov-4-le.c: New test. * gcc.target/aarch64/fmov-5-be.c: New test. * gcc.target/aarch64/fmov-5-le.c: New test. Signed-off-by: Pengxuan Zheng Diff: --- gcc/testsuite/gcc.target/aarch64/fmov-3-be.c | 77 ++ gcc/testsuite/gcc.target/aarch64/fmov-3-le.c | 129 +++ gcc/testsuite/gcc.target/aarch64/fmov-4-be.c | 54 ++ gcc/testsuite/gcc.target/aarch64/fmov-4-le.c | 94 + gcc/testsuite/gcc.target/aarch64/fmov-5-be.c | 150 +++ gcc/testsuite/gcc.target/aarch64/fmov-5-le.c | 150 +++ 6 files changed, 654 insertions(+) diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c b/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c new file mode 100644 index ..0bddd96ea000 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmov-3-be.c @@ -0,0 +1,77 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mbig-endian" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#pragma GCC target ("arch=armv8-a") + +typedef short v4hi __attribute__ ((vector_size (8))); +typedef int v4si __attribute__ ((vector_size (16))); +typedef float v4sf __attribute__ ((vector_size (16))); +typedef short v8hi __attribute__ ((vector_size (16))); + +/* +** f_v4hi: +** fmovs0, s0 +** ret +*/ +v4hi +f_v4hi (v4hi x) +{ + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 4, 5, 2, 3 }); +} + +/* +** f_v8hi: +** fmovs0, s0 +** ret +*/ +v8hi +f_v8hi (v8hi x) +{ + return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8hi){ 8, 9, 10, 11, 12, 13, 6, 7 }); +} + +/* +** f_v4si: +** fmovd0, d0 +** ret +*/ +v4si +f_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 }); +} + +/* +** g_v4si: +** fmovd0, d0 +** ret +*/ +v4si +g_v4si (v4si x) +{ + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 2, 3, 6, 7 }); +} + +/* +** h_v4si: +** fmovs0, s0 +** ret +*/ +v4si +h_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 4, 5, 6, 3 }); +} + +/* +** f_v4sf: +** fmovd0, d0 +** ret +*/ +v4sf +f_v4sf (v4sf x) +{ + return __builtin_shuffle (x, (v4sf){ 0, 0, 0, 0 }, (v4si){ 6, 7, 2, 3 }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c b/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c new file mode 100644 index ..4545841db36e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/fmov-3-le.c @@ -0,0 +1,129 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mlittle-endian" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#pragma GCC target ("arch=armv8-a") + +typedef short v4hi __attribute__ ((vector_size (8))); +typedef char v8qi __attribute__ ((vector_size (8))); +typedef int v4si __attribute__ ((vector_size (16))); +typedef float v4sf __attribute__ ((vector_size (16))); +typedef short v8hi __attribute__ ((vector_size (16))); +typedef char v16qi __attribute__ ((vector_size (16))); + +/* +** f_v4hi: +** fmovs0, s0 +** ret +*/ +v4hi +f_v4hi (v4hi x) 
+{ + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 0, 1, 4, 5 }); +} + +/* +** g_v4hi: +** (?:(?!fmov).)* +** ret +*/ +v4hi +g_v4hi (v4hi x) +{ + return __builtin_shuffle (x, (v4hi){ 0, 0, 0, 0 }, (v4hi){ 3, 1, 4, 2 }); +} + +/* +** f_v8hi: +** fmovs0, s0 +** ret +*/ +v8hi +f_v8hi (v8hi x) +{ + return __builtin_shuffle (x, (v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }, + (v8hi){ 0, 1, 8, 9, 10, 11, 12, 13 }); +} + +/* +** f_v4si: +** fmovd0, d0 +** ret +*/ +v4si +f_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 1, 4, 5 }); +} + +/* +** g_v4si: +** fmovd0, d0 +** ret +*/ +v4si +g_v4si (v4si x) +{ + return __builtin_shuffle ((v4si){ 0, 0, 0, 0 }, x, (v4si){ 4, 5, 2, 3 }); +} + +/* +** h_v4si: +** fmovs0, s0 +** ret +*/ +v4si +h_v4si (v4si x) +{ + return __builtin_shuffle (x, (v4si){ 0, 0, 0, 0 }, (v4si){ 0, 4, 5, 6 }); +} + +/* +** f_v4sf: +** fmovd0, d0 +** ret +*/ +v4sf +f_v4sf (v4sf x)
[gcc r16-811] aarch64: Carry over zeroness in aarch64_evpc_reencode
https://gcc.gnu.org/g:84c6988c026114727693cd7cd74b8cd5cdcdeb74

commit r16-811-g84c6988c026114727693cd7cd74b8cd5cdcdeb74
Author: Pengxuan Zheng
Date:   Tue May 20 17:58:23 2025 -0700

    aarch64: Carry over zeroness in aarch64_evpc_reencode

    There was a bug in aarch64_evpc_reencode which could leave zero_op0_p and
    zero_op1_p of the struct "newd" uninitialized. r16-701-gd77c3bc1c35e303
    fixed the issue by zero initializing "newd." This patch provides an
    alternative fix as suggested by Richard Sandiford based on the fact that
    the zeroness is preserved by aarch64_evpc_reencode.

    gcc/ChangeLog:

            * config/aarch64/aarch64.cc (aarch64_evpc_reencode): Copy zero_op0_p
            and zero_op1_p from d to newd.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1da615c8955a..2b837ec8e673 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26327,7 +26327,7 @@ aarch64_evpc_trn (struct expand_vec_perm_d *d)
 static bool
 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
-  expand_vec_perm_d newd = {};
+  expand_vec_perm_d newd;
 
   /* The subregs that we'd create are not supported for big-endian SVE;
      see aarch64_modes_compatible_p for details. */
@@ -26353,6 +26353,8 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
   newd.testing_p = d->testing_p;
   newd.one_vector_p = d->one_vector_p;
+  newd.zero_op0_p = d->zero_op0_p;
+  newd.zero_op1_p = d->zero_op1_p;
   newd.perm.new_vector (newpermindices.encoding (), newd.one_vector_p ? 1 : 2,
                         newpermindices.nelts_per_input ());