[gcc r15-3669] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]
https://gcc.gnu.org/g:a92f54f580c37732a5de01e47aed56882231f196

commit r15-3669-ga92f54f580c37732a5de01e47aed56882231f196
Author: Pengxuan Zheng
Date:   Tue Sep 10 17:59:46 2024 -0700

    aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

    SVE's INDEX instruction can be used to populate vectors by values starting
    from "base" and incremented by "step" for each subsequent value. We can take
    advantage of it to generate vector constants if TARGET_SVE is available and
    the base and step values are within [-16, 15].

    For example, with the following function:

    typedef int v4si __attribute__ ((vector_size (16)));
    v4si
    f_v4si (void)
    {
      return (v4si){ 0, 1, 2, 3 };
    }

    GCC currently generates:

    f_v4si:
            adrp    x0, .LC4
            ldr     q0, [x0, #:lo12:.LC4]
            ret

    .LC4:
            .word   0
            .word   1
            .word   2
            .word   3

    With this patch, we generate an INDEX instruction instead if TARGET_SVE is
    available.

    f_v4si:
            index   z0.s, #0, #1
            ret

            PR target/113328

    gcc/ChangeLog:

            * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
            handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE
            is available.
            (aarch64_output_simd_mov_immediate): Likewise.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
            SVE's INDEX instruction.
            * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
            * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
            * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
            * gcc.target/aarch64/sve/vec_init_3.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64.cc                     | 13 ++-
 .../gcc.target/aarch64/sve/acle/general/dupq_1.c  |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_2.c  |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_3.c  |  3 +-
 .../gcc.target/aarch64/sve/acle/general/dupq_4.c  |  3 +-
 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c | 99 ++
 6 files changed, 115 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6ccf08d1cc0a..92763d403c75 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22987,7 +22987,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
       && CONST_VECTOR_DUPLICATE_P (op))
     n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
+  else if (which == AARCH64_CHECK_MOV
+	   && TARGET_SVE
 	   && const_vec_series_p (op, &base, &step))
     {
       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
@@ -25245,6 +25246,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,

   if (which == AARCH64_CHECK_MOV)
     {
+      if (info.insn == simd_immediate_info::INDEX)
+	{
+	  gcc_assert (TARGET_SVE);
+	  snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+		    HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+		    element_char, INTVAL (info.u.index.base),
+		    INTVAL (info.u.index.step));
+	  return templ;
+	}
+
       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
		  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536e..0940bedd0ddb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }

-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a2753..218a66013375 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@ dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }

-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
[gcc r15-1801] aarch64: Add vector popcount besides QImode [PR113859]
https://gcc.gnu.org/g:895bbc08d38c2aca3cbbab273a247021fea73930

commit r15-1801-g895bbc08d38c2aca3cbbab273a247021fea73930
Author: Pengxuan Zheng
Date:   Wed Jun 12 18:23:13 2024 -0700

    aarch64: Add vector popcount besides QImode [PR113859]

    This patch improves GCC's vectorization of __builtin_popcount for the
    aarch64 target by adding popcount patterns for vector modes besides QImode,
    i.e., HImode, SImode and DImode.

    With this patch, we now generate the following for V8HI:
            cnt     v1.16b, v0.16b
            uaddlp  v2.8h, v1.16b

    For V4HI, we generate:
            cnt     v1.8b, v0.8b
            uaddlp  v2.4h, v1.8b

    For V4SI, we generate:
            cnt     v1.16b, v0.16b
            uaddlp  v2.8h, v1.16b
            uaddlp  v3.4s, v2.8h

    For V4SI with TARGET_DOTPROD, we generate the following instead:
            movi    v0.4s, #0
            movi    v1.16b, #1
            cnt     v3.16b, v2.16b
            udot    v0.4s, v3.16b, v1.16b

    For V2SI, we generate:
            cnt     v1.8b, v0.8b
            uaddlp  v2.4h, v1.8b
            uaddlp  v3.2s, v2.4h

    For V2SI with TARGET_DOTPROD, we generate the following instead:
            movi    v0.8b, #0
            movi    v1.8b, #1
            cnt     v3.8b, v2.8b
            udot    v0.2s, v3.8b, v1.8b

    For V2DI, we generate:
            cnt     v1.16b, v0.16b
            uaddlp  v2.8h, v1.16b
            uaddlp  v3.4s, v2.8h
            uaddlp  v4.2d, v3.4s

    For V2DI with TARGET_DOTPROD, we generate the following instead:
            movi    v0.4s, #0
            movi    v1.16b, #1
            cnt     v3.16b, v2.16b
            udot    v0.4s, v3.16b, v1.16b
            uaddlp  v0.2d, v0.4s

            PR target/113859

    gcc/ChangeLog:

            * config/aarch64/aarch64-simd.md (aarch64_<su>addlp<mode>): Rename
            to...
            (@aarch64_<su>addlp<mode>): ... This.
            (popcount<mode>2): New define_expand.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/popcnt-udot.c: New test.
            * gcc.target/aarch64/popcnt-vec.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64-simd.md             | 41 ++-
 gcc/testsuite/gcc.target/aarch64/popcnt-udot.c | 58 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-vec.c  | 69 ++
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 01b084d8ccb..fd0c5e612b5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3461,7 +3461,7 @@
   [(set_attr "type" "neon_reduc_add")]
 )

-(define_expand "aarch64_<su>addlp<mode>"
+(define_expand "@aarch64_<su>addlp<mode>"
   [(set (match_operand:<VDBLW> 0 "register_operand")
	(plus:<VDBLW>
	  (vec_select:<VDBLW>
@@ -3517,6 +3517,45 @@
   [(set_attr "type" "neon_cnt")]
 )

+(define_expand "popcount<mode>2"
+  [(set (match_operand:VDQHSD 0 "register_operand")
+	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  "TARGET_SIMD"
+  {
+    /* Generate a byte popcount.  */
+    machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
+    rtx tmp = gen_reg_rtx (mode);
+    auto icode = optab_handler (popcount_optab, mode);
+    emit_insn (GEN_FCN (icode) (tmp, gen_lowpart (mode, operands[1])));
+
+    if (TARGET_DOTPROD
+	&& (<VEL>mode == SImode || <VEL>mode == DImode))
+      {
+	/* For V4SI and V2SI, we can generate a UDOT with a 0 accumulator and a
+	   1 multiplicand.  For V2DI, another UAADDLP is needed.  */
+	rtx ones = force_reg (mode, CONST1_RTX (mode));
+	auto icode = optab_handler (udot_prod_optab, mode);
+	mode = <bitsize> == 64 ? V2SImode : V4SImode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	rtx zeros = force_reg (mode, CONST0_RTX (mode));
+	emit_insn (GEN_FCN (icode) (dest, tmp, ones, zeros));
+	tmp = dest;
+      }
+
+    /* Use a sequence of UADDLPs to accumulate the counts.  Each step doubles
+       the element size and halves the number of elements.  */
+    while (mode != <MODE>mode)
+      {
+	auto icode = code_for_aarch64_addlp (ZERO_EXTEND, GET_MODE (tmp));
+	mode = insn_data[icode].operand[0].mode;
+	rtx dest = mode == <MODE>mode ? operands[0] : gen_reg_rtx (mode);
+	emit_insn (GEN_FCN (icode) (dest, tmp));
+	tmp = dest;
+      }
+    DONE;
+  }
+)
+
 ;; 'across lanes' max and min ops.

 ;; Template for outputting a scalar, so we can create __builtins which can be
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
new file mode 100644
index 000..f6a968dae95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-udot.c
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+dotprod -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+
+/*
+** bar:
+**	movi	v([0-9]+).16b, 0x1
+**	movi	v([0-9]+).4s, 0
+**	ldr	q([0-9]+), \[x0\]
+**	cnt	v([0-9]+).16b, v\3.16b
+**	udot	v\2.4s, v\4.16b, v\
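For reference, here is a self-contained loop that exercises the new expander (illustrative only, not one of the commit's tests; popcnt-vec.c covers similar ground): compiled at -O3 on aarch64, each call below is expected to map onto the CNT/UADDLP (or CNT/UDOT with +dotprod) sequences shown above rather than staying scalar.

    /* Hypothetical example: vectorizable popcount over 32-bit lanes.  */
    void
    count_bits (unsigned int *__restrict dst,
                const unsigned int *__restrict src, int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = __builtin_popcount (src[i]);
    }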
[gcc r15-2659] aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]
https://gcc.gnu.org/g:e4b8db26de35239bd621aad9c0361f25d957122b

commit r15-2659-ge4b8db26de35239bd621aad9c0361f25d957122b
Author: Pengxuan Zheng
Date:   Wed Jul 31 17:00:01 2024 -0700

    aarch64: Improve Advanced SIMD popcount expansion by using SVE [PR113860]

    This patch improves the Advanced SIMD popcount expansion by using SVE if
    available.

    For example, GCC currently generates the following code sequence for V2DI:
            cnt     v31.16b, v31.16b
            uaddlp  v31.8h, v31.16b
            uaddlp  v31.4s, v31.8h
            uaddlp  v31.2d, v31.4s

    However, by using SVE, we can generate the following sequence instead:
            ptrue   p7.b, all
            cnt     z31.d, p7/m, z31.d

    Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

    The scalar popcount expansion can also be improved similarly by using SVE
    and those changes will be included in a separate patch.

            PR target/113860

    gcc/ChangeLog:

            * config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
            support.
            * config/aarch64/aarch64-sve.md (@aarch64_pred_<optab><mode>): Use
            new iterator SVE_VDQ_I.
            * config/aarch64/iterators.md (SVE_VDQ_I): New mode iterator.
            (VPRED): Add V8QI, V16QI, V4HI, V8HI and V2SI.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/popcnt-sve.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64-simd.md            |  9 +++
 gcc/config/aarch64/aarch64-sve.md             | 13 ++--
 gcc/config/aarch64/iterators.md               |  5 ++
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++
 4 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 459e11b09a19..816f499e9634 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@
	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+						    p,
+						    operands[1]));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
     rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index c3ed5075c4ed..a5cd42be9d5c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3104,16 +3104,16 @@
 ;; Integer unary arithmetic predicated with a PTRUE.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_I 0 "register_operand")
-	(unspec:SVE_I
+  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
+	(unspec:SVE_VDQ_I
	  [(match_operand:<VPRED> 1 "register_operand")
-	   (SVE_INT_UNARY:SVE_I
-	     (match_operand:SVE_I 2 "register_operand"))]
+	   (SVE_INT_UNARY:SVE_VDQ_I
+	     (match_operand:SVE_VDQ_I 2 "register_operand"))]
	  UNSPEC_PRED_X))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
-     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
-     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
+     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
+     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
  }
 )
@@ -3168,6 +3168,7 @@
   }
 )

+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] General unary arithmetic corresponding to unspecs
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 95fe8f070f4c..aaa4afefe2ce 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -559,6 +559,9 @@
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])

+;; All SVE and Advanced SIMD integer vector modes.
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
				VNx4SI VNx2SI
@@ -2278,6 +2281,8 @@
			     (VNx32BF "VNx8BI")
			     (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
			     (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+			     (V8QI "VNx8BI") (V16QI "VNx16BI")
+			     (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
			     (V4SI "VNx4BI") (V2DI "VNx2BI")])

 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644
index ..8e349efe3907
--- /dev/null
+
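To observe the SVE form (a sketch, assuming -O2 -march=armv8.2-a+sve; not taken from the truncated popcnt-sve.c above): 64-bit lanes show the largest win, since the three UADDLP steps collapse into one predicated CNT.

    /* Hypothetical example: expected to use ptrue + "cnt z.d, p/m, z.d"
       per vector of two 64-bit lanes.  */
    void
    count_bits_u64 (unsigned long long *__restrict dst,
                    const unsigned long long *__restrict src, int n)
    {
      for (int i = 0; i < n; i++)
        dst[i] = __builtin_popcountll (src[i]);
    }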
[gcc r15-949] MAINTAINERS: Add myself to Write After Approval and DCO
https://gcc.gnu.org/g:96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7

commit r15-949-g96ec186d1dbeaa87453c3703e25fae7ce3ddbbb7
Author: Pengxuan Zheng
Date:   Fri May 31 11:07:05 2024 -0700

    MAINTAINERS: Add myself to Write After Approval and DCO

    ChangeLog:

            * MAINTAINERS: Add myself to Write After Approval and DCO.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e2870eef2ef..6444e6ea2f1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -743,6 +743,7 @@
 Dennis Zhang
 Yufeng Zhang
 Qing Zhao
 Shujing Zhao
+Pengxuan Zheng
 Jon Ziegler
 Roman Zippel
 Josef Zlomek
@@ -789,3 +790,4 @@
 Martin Uecker
 Jonathan Wakely
 Alexander Westbrooks
 Chung-Ju Wu
+Pengxuan Zheng
[gcc r15-950] aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c
https://gcc.gnu.org/g:7fb62627cfb3e03811bb667fa7159bbc7f972f00

commit r15-950-g7fb62627cfb3e03811bb667fa7159bbc7f972f00
Author: Pengxuan Zheng
Date:   Wed May 22 17:38:43 2024 -0700

    aarch64: testsuite: Explicitly add -mlittle-endian to vget_low_2.c

    vget_low_2.c is a test case for little-endian, but we missed the
    -mlittle-endian flag in r15-697-ga2e4fe5a53cf75.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/vget_low_2.c: Add -mlittle-endian.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
index 44414e1c043..93e9e664ee9 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-options "-O3 -fdump-tree-optimized -mlittle-endian" } */

 #include <arm_neon.h>
[gcc r15-1079] aarch64: Add vector floating point extend pattern [PR113880, PR113869]
https://gcc.gnu.org/g:230d62a2cdd16c1ec8fe87998ec01081503f010d

commit r15-1079-g230d62a2cdd16c1ec8fe87998ec01081503f010d
Author: Pengxuan Zheng
Date:   Thu May 30 17:53:23 2024 -0700

    aarch64: Add vector floating point extend pattern [PR113880, PR113869]

    This patch adds a vector floating point extend pattern for V2SF->V2DF and
    V4HF->V4SF conversions by renaming the existing
    aarch64_float_extend_lo_<Vwide> pattern to the standard optab one, i.e.,
    extend<mode><Vwide>2. This allows the vectorizer to vectorize certain
    floating point widening operations for the aarch64 target.

            PR target/113880
            PR target/113869

    gcc/ChangeLog:

            * config/aarch64/aarch64-builtins.cc (VAR1): Remap float_extend_lo_
            builtin codes to standard optab ones.
            * config/aarch64/aarch64-simd.md (aarch64_float_extend_lo_<Vwide>):
            Rename to...
            (extend<mode><Vwide>2): ... This.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/extend-vec.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc        |  9 +
 gcc/config/aarch64/aarch64-simd.md            |  2 +-
 gcc/testsuite/gcc.target/aarch64/extend-vec.c | 21 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index f8eeccb554d..25189888d17 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -534,6 +534,15 @@ BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
 BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)
 BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)

+/* The builtins below should be expanded through the standard optabs
+   CODE_FOR_extend<mode><Vwide>2.  */
+#undef VAR1
+#define VAR1(F,T,N,M) \
+  constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T##N##M##2;
+
+VAR1 (float_extend_lo_, extend, v2sf, v2df)
+VAR1 (float_extend_lo_, extend, v4hf, v4sf)
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 868f4486218..c5e2c9f00d0 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3132,7 +3132,7 @@
     DONE;
   }
 )
-(define_insn "aarch64_float_extend_lo_<Vwide>"
+(define_insn "extend<mode><Vwide>2"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
	(float_extend:<VWIDE>
	  (match_operand:VDF 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/extend-vec.c b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
new file mode 100644
index 000..f6241d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/extend-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.2d, v[0-9]+.2s} 1 } } */
+void
+f (float *__restrict a, double *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtl\tv[0-9]+.4s, v[0-9]+.4h} 1 } } */
+void
+f1 (_Float16 *__restrict a, float *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}
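A loop-based variant of the commit's straight-line tests (illustrative; the function name is made up): once the pattern is exposed under the standard optab name, the vectorizer can widen whole loops through fcvtl/fcvtl2 as well, e.g. at -O3:

    /* Expected to vectorize via extendv2sfv2df2 (fcvtl/fcvtl2).  */
    void
    widen (const float *__restrict a, double *__restrict b, int n)
    {
      for (int i = 0; i < n; i++)
        b[i] = a[i];
    }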
[gcc r15-1182] aarch64: Add vector floating point trunc pattern
https://gcc.gnu.org/g:e7cd8ea1fa3e48404954bb7c06e9bcd603f132dd

commit r15-1182-ge7cd8ea1fa3e48404954bb7c06e9bcd603f132dd
Author: Pengxuan Zheng
Date:   Fri Jun 7 19:52:00 2024 -0700

    aarch64: Add vector floating point trunc pattern

    This patch is a follow-up of r15-1079-g230d62a2cdd16c to add a vector
    floating point trunc pattern for V2DF->V2SF and V4SF->V4HF conversions by
    renaming the existing aarch64_float_truncate_lo_<mode> pattern to the
    standard optab one, i.e., trunc<Vwide><mode>2. This allows the vectorizer
    to vectorize certain floating point narrowing operations for the aarch64
    target.

    gcc/ChangeLog:

            * config/aarch64/aarch64-builtins.cc (VAR1): Remap
            float_truncate_lo_ builtin codes to standard optab ones.
            * config/aarch64/aarch64-simd.md (aarch64_float_truncate_lo_<mode>):
            Rename to...
            (trunc<Vwide><mode>2): ... This.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/trunc-vec.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc       |  7 +++
 gcc/config/aarch64/aarch64-simd.md           |  6 +++---
 gcc/testsuite/gcc.target/aarch64/trunc-vec.c | 21 +
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 25189888d17d..d589e59defc2 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -543,6 +543,13 @@ BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
 VAR1 (float_extend_lo_, extend, v2sf, v2df)
 VAR1 (float_extend_lo_, extend, v4hf, v4sf)

+/* __builtin_aarch64_float_truncate_lo_<mode> should be expanded through the
+   standard optabs CODE_FOR_trunc<Vwide><mode>2.  */
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v4hf
+  = CODE_FOR_truncv4sfv4hf2;
+constexpr insn_code CODE_FOR_aarch64_float_truncate_lo_v2sf
+  = CODE_FOR_truncv2dfv2sf2;
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5e2c9f00d02..f644bd1731e5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3197,7 +3197,7 @@
   }
 )

-(define_insn "aarch64_float_truncate_lo_<mode>"
+(define_insn "trunc<Vwide><mode>2"
   [(set (match_operand:VDF 0 "register_operand" "=w")
	(float_truncate:VDF
	  (match_operand:<VWIDE> 1 "register_operand" "w")))]
@@ -3256,7 +3256,7 @@
     int lo = BYTES_BIG_ENDIAN ? 2 : 1;
     int hi = BYTES_BIG_ENDIAN ? 1 : 2;

-    emit_insn (gen_aarch64_float_truncate_lo_v2sf (tmp, operands[lo]));
+    emit_insn (gen_truncv2dfv2sf2 (tmp, operands[lo]));
     emit_insn (gen_aarch64_float_truncate_hi_v4sf (operands[0], tmp,
						   operands[hi]));
     DONE;
@@ -3272,7 +3272,7 @@
   {
     rtx tmp = gen_reg_rtx (V2SFmode);
     emit_insn (gen_aarch64_vec_concatdf (tmp, operands[1], operands[2]));
-    emit_insn (gen_aarch64_float_truncate_lo_v2sf (operands[0], tmp));
+    emit_insn (gen_truncv2dfv2sf2 (operands[0], tmp));
     DONE;
   }
 )
diff --git a/gcc/testsuite/gcc.target/aarch64/trunc-vec.c b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
new file mode 100644
index ..05e8af7912de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/trunc-vec.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.2s, v[0-9]+.2d} 1 } } */
+void
+f (double *__restrict a, float *__restrict b)
+{
+  b[0] = a[0];
+  b[1] = a[1];
+}
+
+/* { dg-final { scan-assembler-times {fcvtn\tv[0-9]+.4h, v[0-9]+.4s} 1 } } */
+void
+f1 (float *__restrict a, _Float16 *__restrict b)
+{
+
+  b[0] = a[0];
+  b[1] = a[1];
+  b[2] = a[2];
+  b[3] = a[3];
+}
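And the narrowing counterpart (again illustrative, not from the commit's testsuite): with trunc<Vwide><mode>2 exposed, a double-to-float loop can vectorize through fcvtn/fcvtn2 at -O3:

    /* Expected to vectorize via truncv2dfv2sf2 (fcvtn/fcvtn2).  */
    void
    narrow (const double *__restrict a, float *__restrict b, int n)
    {
      for (int i = 0; i < n; i++)
        b[i] = (float) a[i];
    }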
[gcc r15-4579] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]
https://gcc.gnu.org/g:9ffcf1f193b477f417a4c1960cd32696a23b99b4

commit r15-4579-g9ffcf1f193b477f417a4c1960cd32696a23b99b4
Author: Pengxuan Zheng
Date:   Mon Oct 14 05:37:49 2024 -0700

    aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

    This is similar to the recent improvements to the Advanced SIMD popcount
    expansion by using SVE. We can utilize SVE to generate more efficient code
    for scalar mode popcount too.

    Changes since v1:
    * v2: Add a new VNx1BI mode and a new test case for V1DI.
    * v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg.

            PR target/113860

    gcc/ChangeLog:

            * config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function.
            * config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern
            to also support V1DI mode.
            * config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function.
            * config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE
            support.
            * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
            (SVE_VDQ_I): Add V1DI.
            (bitsize): Likewise.
            (VPRED): Likewise.
            (VEC_POP_MODE): New mode attribute.
            (vec_pop_mode): Likewise.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/popcnt-sve.c: Update test.
            * gcc.target/aarch64/popcnt11.c: New test.
            * gcc.target/aarch64/popcnt12.c: New test.

    Signed-off-by: Pengxuan Zheng

Diff:
---
 gcc/config/aarch64/aarch64-protos.h           |  1 +
 gcc/config/aarch64/aarch64-simd.md            | 15 +--
 gcc/config/aarch64/aarch64.cc                 | 21 ++
 gcc/config/aarch64/aarch64.md                 |  9 +
 gcc/config/aarch64/iterators.md               | 16 ++--
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 10 ++---
 gcc/testsuite/gcc.target/aarch64/popcnt11.c   | 58 +++
 gcc/testsuite/gcc.target/aarch64/popcnt12.c   | 20 +
 8 files changed, 139 insertions(+), 11 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 06aa0aac0df6..75f30a52e617 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
 void aarch64_expand_mov_immediate (rtx, rtx);
 rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
 rtx aarch64_ptrue_reg (machine_mode);
+rtx aarch64_ptrue_reg (machine_mode, unsigned int);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 04851524fdea..68839246fd8a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3516,19 +3516,28 @@
 )

 (define_expand "popcount<mode>2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-	(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+	(popcount:VDQHSD_V1DI
+	  (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
     if (TARGET_SVE)
       {
-	rtx p = aarch64_ptrue_reg (<VPRED>mode);
+	rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16);
	emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
						    p,
						    operands[1]));
	DONE;
       }

+    if (<MODE>mode == V1DImode)
+      {
+	rtx out = gen_reg_rtx (DImode);
+	emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
+	emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
+	DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
     machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode;
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 3e1d67431566..e6d957d275d1 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode)
   return gen_lowpart (mode, reg);
 }

+/* Return an all-true (restricted to the leading VL bits) predicate register of
+   mode MODE.  */
+
+rtx
+aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
+{
+  gcc_assert (aarch64_sve_pred_mode_p (mode));
+
+  rtx_vector_builder builder (VNx16BImode, vl, 2);
+
+  for (int i = 0; i < vl; i++)
+    builder.quick_push (CONST1_RTX (BImode));
+
+  for (int i = 0; i < vl; i++)
+    builder.quick_push (CONST0_RTX (BImode));
+
+  rtx const_vec = builder.build ();
+  rtx reg = force_reg (VNx16BImode, const_vec);
+  return gen_lowpart (mode, reg);
+}
+
 /* Return an all-false predicate register of mode MODE.  */

 rtx
diff --git a/gcc/config/aarch64/aarch64.
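Although the diff is cut off above, the commit description implies the scalar case benefits the same way. A sketch of what one would test (hypothetical; the commit's popcnt11.c presumably covers this ground):

    /* With TARGET_SVE, a single predicated CNT on one 64-bit lane can replace
       the Advanced SIMD cnt-plus-addv reduction for scalar popcount.  */
    int
    popcount_u64 (unsigned long long x)
    {
      return __builtin_popcountll (x);
    }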