Sorry for the slow review.
Pengxuan Zheng <[email protected]> writes:
> This patch improves the Advanced SIMD popcount expansion by using SVE if
> available.
>
> For example, GCC currently generates the following code sequence for V2DI:
> cnt v31.16b, v31.16b
> uaddlp v31.8h, v31.16b
> uaddlp v31.4s, v31.8h
> uaddlp v31.2d, v31.4s
>
> However, by using SVE, we can generate the following sequence instead:
> ptrue p7.b, all
> cnt z31.d, p7/m, z31.d
>
> Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.
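(For instance, the V4SI case would presumably go from something like:

        cnt     v31.16b, v31.16b
        uaddlp  v31.8h, v31.16b
        uaddlp  v31.4s, v31.8h

to:

        ptrue   p7.b, all
        cnt     z31.s, p7/m, z31.s

which is what the new test below checks for.)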
>
> The scalar popcount expansion can also be improved similarly by using SVE;
> those changes will be included in a separate patch.
>
> PR target/113860
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
> support.
> * config/aarch64/aarch64-sve.md (@aarch64_pred_popcount<mode>): New
> insn.
> * config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/popcnt-sve.c: New test.
>
> Signed-off-by: Pengxuan Zheng <[email protected]>
> ---
> gcc/config/aarch64/aarch64-simd.md | 9 ++
> gcc/config/aarch64/aarch64-sve.md | 12 +++
> gcc/config/aarch64/iterators.md | 1 +
> gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
> 4 files changed, 110 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index bbeee221f37..895d6e5eab5 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
> (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> "TARGET_SIMD"
> {
> + if (TARGET_SVE)
> + {
> + rtx p = aarch64_ptrue_reg (<VPRED>mode);
> + emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
> + p,
> + operands[1]));
> + DONE;
> + }
> +
> /* Generate a byte popcount. */
> machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
> rtx tmp = gen_reg_rtx (mode);
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 5331e7121d5..b5021dd2da0 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3168,6 +3168,18 @@ (define_insn "*cond_<optab><mode>_any"
> }
> )
>
> +;; Popcount predicated with a PTRUE.
> +(define_insn "@aarch64_pred_popcount<mode>"
> + [(set (match_operand:VDQHSD 0 "register_operand" "=w")
> + (unspec:VDQHSD
> + [(match_operand:<VPRED> 1 "register_operand" "Upl")
> + (popcount:VDQHSD
> + (match_operand:VDQHSD 2 "register_operand" "0"))]
> + UNSPEC_PRED_X))]
> + "TARGET_SVE"
> + "cnt\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>"
> +)
> +
Could you instead change:
(define_insn "@aarch64_pred_<optab><mode>"
  [(set (match_operand:SVE_I 0 "register_operand")
        (unspec:SVE_I
          [(match_operand:<VPRED> 1 "register_operand")
           (SVE_INT_UNARY:SVE_I
             (match_operand:SVE_I 2 "register_operand"))]
          UNSPEC_PRED_X))]
  "TARGET_SVE"
  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>
  }
)
to use a new iterator SVE_VDQ_I, defined as:
(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
? That will give the benefit of the movprfx handling and avoid code
duplication. It will define some patterns that are initially unused,
but that's ok. I think the direction of travel would be to use some
of the others eventually.
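FWIW, a rough (completely untested) sketch of what the merged pattern might
look like, assuming the %Z handling from the posted popcount pattern carries
over so that Advanced SIMD operands are printed as z registers:

;; Integer unary arithmetic predicated with a PTRUE, covering both SVE
;; and Advanced SIMD modes.
(define_insn "@aarch64_pred_<optab><mode>"
  [(set (match_operand:SVE_VDQ_I 0 "register_operand")
        (unspec:SVE_VDQ_I
          [(match_operand:<VPRED> 1 "register_operand")
           (SVE_INT_UNARY:SVE_VDQ_I
             (match_operand:SVE_VDQ_I 2 "register_operand"))]
          UNSPEC_PRED_X))]
  "TARGET_SVE"
  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
     [ w        , Upl , 0 ; *              ] <sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
     [ ?&w      , Upl , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_int_op>\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>
  }
)

The popcount<mode>2 change in aarch64-simd.md should then be able to keep
calling gen_aarch64_pred_popcount<mode> exactly as in the posted patch.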
OK with that change if there are no other comments in 24 hours.
Thanks,
Richard
> ;; -------------------------------------------------------------------------
> ;; ---- [INT] General unary arithmetic corresponding to unspecs
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index f527b2cfeb8..a06159b23ea 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2278,6 +2278,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
> (VNx32BF "VNx8BI")
> (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
> (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
> + (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> (V4SI "VNx4BI") (V2DI "VNx2BI")])
>
> ;; ...and again in lower case.
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> new file mode 100644
> index 00000000000..8e349efe390
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
> @@ -0,0 +1,88 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** f_v4hi:
> +** ptrue (p[0-7]).b, all
> +** ldr d([0-9]+), \[x0\]
> +** cnt z\2.h, \1/m, z\2.h
> +** str d\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v8hi:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.h, \1/m, z\2.h
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> + d[4] = __builtin_popcount (b[4]);
> + d[5] = __builtin_popcount (b[5]);
> + d[6] = __builtin_popcount (b[6]);
> + d[7] = __builtin_popcount (b[7]);
> +}
> +
> +/*
> +** f_v2si:
> +** ptrue (p[0-7]).b, all
> +** ldr d([0-9]+), \[x0\]
> +** cnt z\2.s, \1/m, z\2.s
> +** str d\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> +}
> +
> +/*
> +** f_v4si:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.s, \1/m, z\2.s
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
> +{
> + d[0] = __builtin_popcount (b[0]);
> + d[1] = __builtin_popcount (b[1]);
> + d[2] = __builtin_popcount (b[2]);
> + d[3] = __builtin_popcount (b[3]);
> +}
> +
> +/*
> +** f_v2di:
> +** ptrue (p[0-7]).b, all
> +** ldr q([0-9]+), \[x0\]
> +** cnt z\2.d, \1/m, z\2.d
> +** str q\2, \[x1\]
> +** ret
> +*/
> +void
> +f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
> +{
> + d[0] = __builtin_popcountll (b[0]);
> + d[1] = __builtin_popcountll (b[1]);
> +}