When computing the popcount of a single byte, the reduction addition
after the vector cnt instruction is unnecessary, because only one
byte's popcount is being computed.
This adds a new define_expand, popcountqi2, to handle that case.
Bootstrapped and tested on aarch64-linux-gnu with no regressions.
PR target/113042
gcc/ChangeLog:
* config/aarch64/aarch64.md (popcountqi2): New pattern.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/popcnt5.c: New test.
Signed-off-by: Andrew Pinski <[email protected]>
---
gcc/config/aarch64/aarch64.md | 26 ++++++++++++++++++++++
gcc/testsuite/gcc.target/aarch64/popcnt5.c | 19 ++++++++++++++++
2 files changed, 45 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt5.c
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 389a1906e23..ebaf7ec9970 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5358,6 +5358,32 @@ (define_expand "popcount<mode>2"
}
})
+/* A byte popcount needs no reduction after the V8QI popcount, whose low byte
+ is the result. With CSSC, zero-extend and go through popcountsi2 instead. */
+(define_expand "popcountqi2"
+ [(set (match_operand:QI 0 "register_operand" "=w")
+ (popcount:QI (match_operand:QI 1 "register_operand" "w")))]
+ "TARGET_CSSC || TARGET_SIMD"
+{
+ rtx in = operands[1];
+ rtx out = operands[0];
+ if (TARGET_CSSC)
+ {
+ rtx tmp = gen_reg_rtx (SImode);
+ rtx out1 = gen_reg_rtx (SImode);
+ emit_insn (gen_zero_extendqisi2 (tmp, in));
+ emit_insn (gen_popcountsi2 (out1, tmp));
+ emit_move_insn (out, gen_lowpart (QImode, out1));
+ DONE;
+ }
+ rtx v = gen_reg_rtx (V8QImode);
+ rtx v1 = gen_reg_rtx (V8QImode);
+ emit_move_insn (v, gen_lowpart (V8QImode, in));
+ emit_insn (gen_popcountv8qi2 (v1, v));
+ emit_move_insn (out, gen_lowpart (QImode, v1));
+ DONE;
+})
+
(define_insn "clrsb<mode>2"
[(set (match_operand:GPI 0 "register_operand" "=r")
(clrsb:GPI (match_operand:GPI 1 "register_operand" "r")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/popcnt5.c b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
new file mode 100644
index 00000000000..406369d9b29
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/popcnt5.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR target/113042 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h8:
+** ldr b[0-9]+, \[x0\]
+** cnt v[0-9]+.8b, v[0-9]+.8b
+** smov w0, v[0-9]+.b\[0\]
+** ret
+*/
+/* There must be no addv after cnt: a byte popcount needs no reduction. */
+
+unsigned h8 (const unsigned char *a) {
+ return __builtin_popcountg (a[0]);
+}
--
2.42.0