Re: [PATCH v2] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

Richard Sandiford Thu, 10 Oct 2024 12:19:24 -0700

Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> This is similar to the recent improvements to the Advanced SIMD popcount
> expansion by using SVE. We can utilize SVE to generate more efficient code for
> scalar mode popcount too.
>
> Changes since v1:
> * v2: Add a new VNx1BI mode and a new test case for V1DI.


Sorry for the delay in reviewing this, and for the run-around,
but: following the later discussion in the FLOGB thread about using
SVE for Advanced SIMD modes, the agreement was to use the full SVE
predicate mode, but with predicate restricted to the leading 64 bits
or 128 bits (for 64-bit and 128-bit Advanced SIMD modes respectively).
I think we should do that even when it isn't strictly necessary, partly
so that all Advanced SIMD code uses the same predicate, and partly to
avoid bugs that only show up on VL>128 targets.

I'm afraid that means going back to VNx2BI, as in your original patch.
But we should use:

        ptrue   pN.b, vl8

rather than:

        ptrue   pN.b, all

to set the predicate.  We could do this by adding;

rtx
aarch64_ptrue_reg (machine_mode mode, unsigned int vl)

where "vl" is 8 for 64-bit modes and 16 for 128-bit modes.  Like with
the current aarch64_ptrue_reg, the predicate would always be constructed
in VNx16BImode and then cast to the right mode.

Thanks,
Richard

>
>       PR target/113860
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-modes.def (VECTOR_BOOL_MODE): Add VNx1BI.
>       (ADJUST_NUNITS): Likewise.
>       (ADJUST_ALIGNMENT): Likewise.
>       * config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
>       also support V1DI mode.
>       * config/aarch64/aarch64.cc (aarch64_sve_pred_mode_p): Add VNx1BImode.
>       * config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
>       * config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
>       (SVE_VDQ_I): Add V1DI.
>       (bitsize): Likewise.
>       (VPRED): Likewise.
>       (VEC_POP_MODE): New mode attribute.
>       (vec_pop_mode): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/popcnt11.c: New test.
>       * gcc.target/aarch64/popcnt12.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-modes.def        |  3 ++
>  gcc/config/aarch64/aarch64-simd.md          | 13 ++++-
>  gcc/config/aarch64/aarch64.cc               |  3 +-
>  gcc/config/aarch64/aarch64.md               |  9 ++++
>  gcc/config/aarch64/iterators.md             | 16 ++++--
>  gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/popcnt12.c | 18 +++++++
>  7 files changed, 114 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt11.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt12.c
>
> diff --git a/gcc/config/aarch64/aarch64-modes.def 
> b/gcc/config/aarch64/aarch64-modes.def
> index 25a22c1195e..d822d4dfc13 100644
> --- a/gcc/config/aarch64/aarch64-modes.def
> +++ b/gcc/config/aarch64/aarch64-modes.def
> @@ -53,18 +53,21 @@ VECTOR_BOOL_MODE (VNx16BI, 16, BI, 2);
>  VECTOR_BOOL_MODE (VNx8BI, 8, BI, 2);
>  VECTOR_BOOL_MODE (VNx4BI, 4, BI, 2);
>  VECTOR_BOOL_MODE (VNx2BI, 2, BI, 2);
> +VECTOR_BOOL_MODE (VNx1BI, 1, BI, 2);
>  
>  ADJUST_NUNITS (VNx32BI, aarch64_sve_vg * 16);
>  ADJUST_NUNITS (VNx16BI, aarch64_sve_vg * 8);
>  ADJUST_NUNITS (VNx8BI, aarch64_sve_vg * 4);
>  ADJUST_NUNITS (VNx4BI, aarch64_sve_vg * 2);
>  ADJUST_NUNITS (VNx2BI, aarch64_sve_vg);
> +ADJUST_NUNITS (VNx1BI, exact_div (aarch64_sve_vg, 2));
>  
>  ADJUST_ALIGNMENT (VNx32BI, 2);
>  ADJUST_ALIGNMENT (VNx16BI, 2);
>  ADJUST_ALIGNMENT (VNx8BI, 2);
>  ADJUST_ALIGNMENT (VNx4BI, 2);
>  ADJUST_ALIGNMENT (VNx2BI, 2);
> +ADJUST_ALIGNMENT (VNx1BI, 2);
>  
>  /* Bfloat16 modes.  */
>  FLOAT_MODE (BF, 2, 0);
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 23c03a96371..386b1fa1f4b 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3515,8 +3515,9 @@ (define_insn "popcount<mode>2<vczle><vczbe>"
>  )
>  
>  (define_expand "popcount<mode>2"
> -  [(set (match_operand:VDQHSD 0 "register_operand")
> -     (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
> +  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
> +     (popcount:VDQHSD_V1DI
> +       (match_operand:VDQHSD_V1DI 1 "register_operand")))]
>    "TARGET_SIMD"
>    {
>      if (TARGET_SVE)
> @@ -3528,6 +3529,14 @@ (define_expand "popcount<mode>2"
>       DONE;
>        }
>  
> +    if (<MODE>mode == V1DImode)
> +      {
> +     rtx out = gen_reg_rtx (DImode);
> +     emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
> +     emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
> +     DONE;
> +      }
> +
>      /* Generate a byte popcount.  */
>      machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
>      rtx tmp = gen_reg_rtx (mode);
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 92763d403c7..78f65f886b7 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -1476,7 +1476,8 @@ aarch64_sve_pred_mode_p (machine_mode mode)
>         && (mode == VNx16BImode
>             || mode == VNx8BImode
>             || mode == VNx4BImode
> -           || mode == VNx2BImode));
> +           || mode == VNx2BImode
> +           || mode == VNx1BImode));
>  }
>  
>  /* Three mutually-exclusive flags describing a vector or predicate type.  */
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c54b29cd64b..ef52770f1cb 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -5345,6 +5345,15 @@ (define_expand "popcount<mode>2"
>       (popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
>    "TARGET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD"
>  {
> +  if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
> +    {
> +      rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
> +      rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
> +      emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
> +      emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
> +      DONE;
> +    }
> +
>    if (!TARGET_CSSC)
>      {
>        rtx v = gen_reg_rtx (V8QImode);
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 20a318e023b..6c46e4fec3b 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -290,6 +290,8 @@ (define_mode_iterator VDQHS [V4HI V8HI V2SI V4SI])
>  ;; Advanced SIMD modes for H, S and D types.
>  (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
>  
> +(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
> +
>  ;; Advanced SIMD and scalar integer modes for H and S.
>  (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
>  
> @@ -560,7 +562,7 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
>  (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
>  
>  ;; All SVE and Advanced SIMD integer vector modes.
> -(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
> +(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])
>  
>  ;; SVE integer vector modes whose elements are 16 bits or wider.
>  (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
> @@ -1230,7 +1232,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
>  (define_mode_attr bitsize [(V8QI "64") (V16QI "128")
>                          (V4HI "64") (V8HI "128")
>                          (V2SI "64") (V4SI "128")
> -                                    (V2DI "128")])
> +                        (V1DI "64") (V2DI "128")])
>  
>  ;; Map a floating point or integer mode to the appropriate register name 
> prefix
>  (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
> @@ -2284,7 +2286,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
> "VNx8BI")
>                        (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
>                        (V8QI "VNx8BI") (V16QI "VNx16BI")
>                        (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> -                      (V4SI "VNx4BI") (V2DI "VNx2BI")])
> +                      (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx1BI")])
>  
>  ;; ...and again in lower case.
>  (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
> @@ -2318,6 +2320,14 @@ (define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
>                          (VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
>                          (VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
>  
> +;; The Advanced SIMD modes of popcount corresponding to scalar modes.
> +(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
> +                             (SI "V2SI") (DI "V1DI")])
> +
> +;; ...and again in lower case.
> +(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
> +                             (SI "v2si") (DI "v1di")])
> +
>  ;; On AArch64 the By element instruction doesn't have a 2S variant.
>  ;; However because the instruction always selects a pair of values
>  ;; The normal 3SAME instruction can be used here instead.
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt11.c 
> b/gcc/testsuite/gcc.target/aarch64/popcnt11.c
> new file mode 100644
> index 00000000000..595b2f9eb93
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt11.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8.2-a+sve" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** f_qi:
> +**   ldr     b([0-9]+), \[x0\]
> +**   cnt     v\1.8b, v\1.8b
> +**   smov    w0, v\1.b\[0\]
> +**   ret
> +*/
> +unsigned
> +f_qi (unsigned char *a)
> +{
> +  return __builtin_popcountg (a[0]);
> +}
> +
> +/*
> +** f_hi:
> +**   ldr     h([0-9]+), \[x0\]
> +**   ptrue   (p[0-7]).b, all
> +**   cnt     z\1.h, \2/m, z\1.h
> +**   smov    w0, v\1.h\[0\]
> +**   ret
> +*/
> +unsigned
> +f_hi (unsigned short *a)
> +{
> +  return __builtin_popcountg (a[0]);
> +}
> +
> +/*
> +** f_si:
> +**   ldr     s([0-9]+), \[x0\]
> +**   ptrue   (p[0-7]).b, all
> +**   cnt     z\1.s, \2/m, z\1.s
> +**   umov    x0, v\1.d\[0\]
> +**   ret
> +*/
> +unsigned
> +f_si (unsigned int *a)
> +{
> +  return __builtin_popcountg (a[0]);
> +}
> +
> +/*
> +** f_di:
> +**   ldr     d([0-9]+), \[x0\]
> +**   ptrue   (p[0-7])\.b, all
> +**   cnt     z\1\.d, \2/m, z\1\.d
> +**   fmov    x0, d\1
> +**   ret
> +*/
> +unsigned
> +f_di (unsigned long *a)
> +{
> +  return __builtin_popcountg (a[0]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt12.c 
> b/gcc/testsuite/gcc.target/aarch64/popcnt12.c
> new file mode 100644
> index 00000000000..f086cae55a2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/popcnt12.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fgimple" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** foo:
> +**   cnt     v0.8b, v0.8b
> +**   addv    b0, v0.8b
> +**   ret
> +*/
> +__Uint64x1_t __GIMPLE
> +foo (__Uint64x1_t x)
> +{
> +  __Uint64x1_t z;
> +
> +  z = .POPCOUNT (x);
> +  return z;
> +}

Re: [PATCH v2] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

Reply via email to