This patch adds a vector pattern for __builtin_ctz.
Like __builtin_clz, only the 32-bit version of ctz is supported.
For the scalar version of ctz, we expand it into:
rbit
clz
i.e. reverse the bits first, then turn the count of trailing zeros into a count of leading zeros.
For the vector version, however, rbit only supports byte granularity (.8B and .16B);
there is no half-word or word form. So we need to first reverse the bytes within each word,
then reverse the bits within each byte. Thus the generated instruction sequences are:
/* Example: store the trailing-zero count of each of the four words of A
   into B.  Expected to vectorize to rev32 + rbit + clz (see assembly
   below); matches the testcase's TEST (v4si, ctz, 4) expansion.  */
void
count_tz_v4si (unsigned *__restrict a, int *__restrict b)
{
int i;
for (i = 0; i < 4; i++)
b[i] = __builtin_ctz (a[i]);
}
/* Example: same as count_tz_v4si but for a two-word (64-bit, .2S)
   vector; matches the testcase's TEST (v2si, ctz, 2) expansion.  */
void
count_tz_v2si (unsigned *__restrict a, int *__restrict b)
{
int i;
for (i = 0; i < 2; i++)
b[i] = __builtin_ctz (a[i]);
}
count_tz_v4si:
ldr q0, [x0]
rev32 v0.16b, v0.16b
rbit v0.16b, v0.16b
clz v0.4s, v0.4s
str q0, [x1]
ret
count_tz_v2si:
ldr d0, [x0]
rev32 v0.8b, v0.8b
rbit v0.8b, v0.8b
clz v0.2s, v0.2s
str d0, [x1]
ret
No regressions in qemu testing on aarch64-none-linux-gnu.
OK for trunk?
Thanks.
gcc/
* config/aarch64/iterators.md (VS): New mode iterator.
(vsi2qi): New mode attribute.
(VSI2QI): Likewise.
* config/aarch64/aarch64-simd-builtins.def: New entry for ctz.
* config/aarch64/aarch64-simd.md (ctz<mode>2): New pattern for ctz.
* config/aarch64/aarch64-builtins.c
(aarch64_builtin_vectorized_function): Support BUILT_IN_CTZ.
gcc/testsuite/
* gcc.target/aarch64/vect_ctz_1.c: New testcase.
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 527445c..3250f3c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1097,6 +1097,14 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_clzv4si];
return NULL_TREE;
}
+ case BUILT_IN_CTZ:
+ {
+ if (AARCH64_CHECK_BUILTIN_MODE (2, S))
+ return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv2si];
+ else if (AARCH64_CHECK_BUILTIN_MODE (4, S))
+ return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_ctzv4si];
+ return NULL_TREE;
+ }
#undef AARCH64_CHECK_BUILTIN_MODE
#define AARCH64_CHECK_BUILTIN_MODE(C, N) \
(out_mode == N##Imode && out_n == C \
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 62b7f33..c611b5c 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -46,6 +46,7 @@
BUILTIN_VD_BHSI (BINOP, addp, 0)
VAR1 (UNOP, addp, 0, di)
BUILTIN_VDQ_BHSI (UNOP, clz, 2)
+ BUILTIN_VS (UNOP, ctz, 2)
BUILTIN_VALL (GETLANE, be_checked_get_lane, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ef196e4..5ee960f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -303,6 +303,20 @@
[(set_attr "type" "neon_rbit")]
)
+(define_expand "ctz<mode>2"
+ [(set (match_operand:VS 0 "register_operand")
+ (ctz:VS (match_operand:VS 1 "register_operand")))]
+ "TARGET_SIMD"
+ {
+ emit_insn (gen_bswap<mode> (operands[0], operands[1]));
+ rtx op0_castsi2qi = simplify_gen_subreg(<VS:VSI2QI>mode, operands[0],
+ <MODE>mode, 0);
+ emit_insn (gen_aarch64_rbit<VS:vsi2qi> (op0_castsi2qi, op0_castsi2qi));
+ emit_insn (gen_clz<mode>2 (operands[0], operands[0]));
+ DONE;
+ }
+)
+
(define_insn "*aarch64_mul3_elt<mode>"
[(set (match_operand:VMUL 0 "register_operand" "=w")
(mult:VMUL
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 9935167..b416e6a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -183,6 +183,9 @@
;; All byte modes.
(define_mode_iterator VB [V8QI V16QI])
+;; 2 and 4 lane SI modes.
+(define_mode_iterator VS [V2SI V4SI])
+
(define_mode_iterator TX [TI TF])
;; Opaque structure modes.
@@ -670,6 +673,9 @@
(V2DI "p") (V2DF "p")
(V2SF "p") (V4SF "v")])
+(define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
+(define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
+
;; -------------------------------------------------------------------
;; Code Iterators
;; -------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
new file mode 100644
index 0000000..40823b0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_ctz_1.c
@@ -0,0 +1,41 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps -fno-inline" } */
+
+extern void abort ();
+
+#define TEST(name, subname, count) \
+void \
+count_tz_##name (unsigned *__restrict a, int *__restrict b) \
+{ \
+ int i; \
+ for (i = 0; i < count; i++) \
+ b[i] = __builtin_##subname (a[i]); \
+}
+
+#define CHECK(name, count, input, output) \
+ count_tz_##name (input, output); \
+ for (i = 0; i < count; i++) \
+ { \
+ if (output[i] != r[i]) \
+ abort (); \
+ }
+
+TEST (v4si, ctz, 4)
+TEST (v2si, ctz, 2)
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.4s" } } */
+/* { dg-final { scan-assembler "clz\tv\[0-9\]+\.2s" } } */
+
+int
+main ()
+{
+ unsigned int x4[4] = { 0x0, 0xFF80, 0x1FFFF, 0xFF000000 };
+ int r[4] = { 32, 7, 0, 24 };
+ int d[4], i;
+
+ CHECK (v4si, 4, x4, d);
+ CHECK (v2si, 2, x4, d);
+
+ return 0;
+}
+
+/* { dg-final { cleanup-saved-temps } } */