This patch improves the Advanced SIMD popcount expansion by using SVE if
available.

For example, GCC currently generates the following code sequence for V2DI:
  cnt     v31.16b, v31.16b
  uaddlp  v31.8h, v31.16b
  uaddlp  v31.4s, v31.8h
  uaddlp  v31.2d, v31.4s

However, by using SVE, we can generate the following sequence instead:
  ptrue   p7.b, all
  cnt     z31.d, p7/m, z31.d

Similar improvements can be made for V4HI, V8HI, V2SI and V4SI too.

The scalar popcount expansion can also be improved similarly by using SVE and
those changes will be included in a separate patch.

        PR target/113860

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (popcount<mode>2): Add TARGET_SVE
        support.
        * config/aarch64/aarch64-sve.md (@aarch64_pred_popcount<mode>): New
        insn.
        * config/aarch64/iterators.md (VPRED): Add V4HI, V8HI and V2SI.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/popcnt-sve.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
---
 gcc/config/aarch64/aarch64-simd.md            |  9 ++
 gcc/config/aarch64/aarch64-sve.md             | 12 +++
 gcc/config/aarch64/iterators.md               |  1 +
 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c | 88 +++++++++++++++++++
 4 files changed, 110 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt-sve.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index bbeee221f37..895d6e5eab5 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3508,6 +3508,15 @@ (define_expand "popcount<mode>2"
        (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
   "TARGET_SIMD"
   {
+    if (TARGET_SVE)
+      {
+       rtx p = aarch64_ptrue_reg (<VPRED>mode);
+       emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
+                                                   p,
+                                                   operands[1]));
+       DONE;
+      }
+
     /* Generate a byte popcount.  */
     machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
     rtx tmp = gen_reg_rtx (mode);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 5331e7121d5..b5021dd2da0 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3168,6 +3168,18 @@ (define_insn "*cond_<optab><mode>_any"
   }
 )
 
+;; Popcount predicated with a PTRUE.
+(define_insn "@aarch64_pred_popcount<mode>"
+  [(set (match_operand:VDQHSD 0 "register_operand" "=w")
+       (unspec:VDQHSD
+         [(match_operand:<VPRED> 1 "register_operand" "Upl")
+          (popcount:VDQHSD
+            (match_operand:VDQHSD 2 "register_operand" "0"))]
+         UNSPEC_PRED_X))]
+  "TARGET_SVE"
+  "cnt\t%Z0.<Vetype>, %1/m, %Z2.<Vetype>"
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] General unary arithmetic corresponding to unspecs
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index f527b2cfeb8..a06159b23ea 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -2278,6 +2278,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
"VNx8BI")
                         (VNx32BF "VNx8BI")
                         (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
                         (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
+                        (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
                         (V4SI "VNx4BI") (V2DI "VNx2BI")])
 
 ;; ...and again in lower case.
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c 
b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
new file mode 100644
index 00000000000..8e349efe390
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
@@ -0,0 +1,88 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8.2-a+sve -fno-vect-cost-model 
-fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f_v4hi:
+**     ptrue   (p[0-7]).b, all
+**     ldr     d([0-9]+), \[x0\]
+**     cnt     z\2.h, \1/m, z\2.h
+**     str     d\2, \[x1\]
+**     ret
+*/
+void
+f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v8hi:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.h, \1/m, z\2.h
+**     str     q\2, \[x1\]
+**     ret
+*/
+void
+f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+  d[4] = __builtin_popcount (b[4]);
+  d[5] = __builtin_popcount (b[5]);
+  d[6] = __builtin_popcount (b[6]);
+  d[7] = __builtin_popcount (b[7]);
+}
+
+/*
+** f_v2si:
+**     ptrue   (p[0-7]).b, all
+**     ldr     d([0-9]+), \[x0\]
+**     cnt     z\2.s, \1/m, z\2.s
+**     str     d\2, \[x1\]
+**     ret
+*/
+void
+f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+}
+
+/*
+** f_v4si:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.s, \1/m, z\2.s
+**     str     q\2, \[x1\]
+**     ret
+*/
+void
+f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)
+{
+  d[0] = __builtin_popcount (b[0]);
+  d[1] = __builtin_popcount (b[1]);
+  d[2] = __builtin_popcount (b[2]);
+  d[3] = __builtin_popcount (b[3]);
+}
+
+/*
+** f_v2di:
+**     ptrue   (p[0-7]).b, all
+**     ldr     q([0-9]+), \[x0\]
+**     cnt     z\2.d, \1/m, z\2.d
+**     str     q\2, \[x1\]
+**     ret
+*/
+void
+f_v2di (unsigned long *__restrict b, unsigned long *__restrict d)
+{
+  d[0] = __builtin_popcountll (b[0]);
+  d[1] = __builtin_popcountll (b[1]);
+}
-- 
2.17.1

Reply via email to