SVE2 adds [US]ADDW[TB], which we can use when we have to do a single-step
widening addition. This is useful, for instance, when the value to be widened
does not come from a load. For example, for
int foo2_int(unsigned short *x, unsigned short * restrict y) {
int sum = 0;
for (int i = 0; i < 8000; i++)
{
x[i] = x[i] + y[i];
sum += x[i];
}
return sum;
}
we used to generate
.L6:
ld1h z1.h, p7/z, [x0, x2, lsl 1]
ld1h z29.h, p7/z, [x1, x2, lsl 1]
add z29.h, z29.h, z1.h
punpklo p6.h, p7.b
uunpklo z0.s, z29.h
add z31.s, p6/m, z31.s, z0.s
punpkhi p6.h, p7.b
uunpkhi z30.s, z29.h
add z31.s, p6/m, z31.s, z30.s
st1h z29.h, p7, [x0, x2, lsl 1]
add x2, x2, x4
whilelo p7.h, w2, w3
b.any .L6
ptrue p7.b, all
uaddv d31, p7, z31.s
but with +sve2
.L12:
ld1h z30.h, p7/z, [x0, x2, lsl 1]
ld1h z29.h, p7/z, [x1, x2, lsl 1]
add z30.h, z30.h, z29.h
uaddwb z31.s, z31.s, z30.h
uaddwt z31.s, z31.s, z30.h
st1h z30.h, p7, [x0, x2, lsl 1]
mov x3, x2
inch x2
cmp w2, w4
bls .L12
inch x3
uaddv d31, p7, z31.s
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR middle-end/122069
* config/aarch64/aarch64-sve2.md
(@aarch64_sve_<sve_int_op>_internal<mode>): New.
(widen_ssum<mode><Vnarrow>3): New.
(widen_usum<mode><Vnarrow>3): New.
* config/aarch64/iterators.md (Vnarrow): New, to match VNARROW.
gcc/testsuite/ChangeLog:
PR middle-end/122069
* gcc.target/aarch64/sve2/pr122069_1.c: New test.
* gcc.target/aarch64/sve2/pr122069_2.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 69a376706facaa5f0dd5032fa30cb9298d222568..5f3b10ade8f55f9a71eaa0e3fe060627621108f6 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -2377,6 +2377,50 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
[(set_attr "sve_type" "sve_int_general")]
)
+(define_insn "@aarch64_sve_<sve_int_op>_internal<mode>"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+ (match_operand:<VNARROW> 2 "register_operand" "w")
+ (match_operand:SVE_FULL_HSDI 3 "register_operand" "0")]
+ SVE2_INT_BINARY_WIDE))]
+ "TARGET_SVE2"
+ "<sve_int_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Ventype>"
+ [(set_attr "sve_type" "sve_int_general")]
+)
+
+;; Define single step widening for widen_ssum using SADDWB and SADDWT
+(define_expand "widen_ssum<mode><Vnarrow>3"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+ (match_operand:<VNARROW> 1 "register_operand" "w")]
+ UNSPEC_SADDWB))
+ (set (match_dup 0)
+ (unspec:SVE_FULL_HSDI
+ [(match_dup 2)
+ (match_dup 1)
+ (match_dup 0)]
+ UNSPEC_SADDWT))]
+ "TARGET_SVE2"
+)
+
+;; Define single step widening for widen_usum using UADDWB and UADDWT
+(define_expand "widen_usum<mode><Vnarrow>3"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+ (match_operand:<VNARROW> 1 "register_operand" "w")]
+ UNSPEC_UADDWB))
+ (set (match_dup 0)
+ (unspec:SVE_FULL_HSDI
+ [(match_dup 2)
+ (match_dup 1)
+ (match_dup 0)]
+ UNSPEC_UADDWT))]
+ "TARGET_SVE2"
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] Long binary arithmetic
;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 61ca4990b94170f016a9f50e3505c8cfb24df9be..3757998c0ea9831b526a5bbc8568933fc05ed5d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1935,6 +1935,11 @@ (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
(VNx2DI "VNx4SI") (VNx2DF "VNx4SF")
(VNx8SI "VNx8HI") (VNx16SI "VNx16QI")
(VNx8DI "VNx8HI")])
+(define_mode_attr Vnarrow [(VNx8HI "vnx16qi")
+ (VNx4SI "vnx8hi") (VNx4SF "vnx8hf")
+ (VNx2DI "vnx4si") (VNx2DF "vnx4sf")
+ (VNx8SI "vnx8hi") (VNx16SI "vnx16qi")
+ (VNx8DI "vnx8hi")])
;; Suffix mapping Advanced SIMD modes to be expanded as SVE instructions.
(define_mode_attr sve_di_suf [(VNx16QI "") (VNx8HI "") (VNx4SI "") (VNx2DI "")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a347072ae892ceeabddc05dbb4ead4814dda2da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
+** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** uaddwb z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
+** uaddwt z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9ae97158688aad60ed2b705c02621ee7a33e6ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve2_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file
--
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index 69a376706facaa5f0dd5032fa30cb9298d222568..5f3b10ade8f55f9a71eaa0e3fe060627621108f6 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -2377,6 +2377,50 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
[(set_attr "sve_type" "sve_int_general")]
)
+(define_insn "@aarch64_sve_<sve_int_op>_internal<mode>"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+ (match_operand:<VNARROW> 2 "register_operand" "w")
+ (match_operand:SVE_FULL_HSDI 3 "register_operand" "0")]
+ SVE2_INT_BINARY_WIDE))]
+ "TARGET_SVE2"
+ "<sve_int_op>\t%0.<Vetype>, %1.<Vetype>, %2.<Ventype>"
+ [(set_attr "sve_type" "sve_int_general")]
+)
+
+;; Define single step widening for widen_ssum using SADDWB and SADDWT
+(define_expand "widen_ssum<mode><Vnarrow>3"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+ (match_operand:<VNARROW> 1 "register_operand" "w")]
+ UNSPEC_SADDWB))
+ (set (match_dup 0)
+ (unspec:SVE_FULL_HSDI
+ [(match_dup 2)
+ (match_dup 1)
+ (match_dup 0)]
+ UNSPEC_SADDWT))]
+ "TARGET_SVE2"
+)
+
+;; Define single step widening for widen_usum using UADDWB and UADDWT
+(define_expand "widen_usum<mode><Vnarrow>3"
+ [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+ (unspec:SVE_FULL_HSDI
+ [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+ (match_operand:<VNARROW> 1 "register_operand" "w")]
+ UNSPEC_UADDWB))
+ (set (match_dup 0)
+ (unspec:SVE_FULL_HSDI
+ [(match_dup 2)
+ (match_dup 1)
+ (match_dup 0)]
+ UNSPEC_UADDWT))]
+ "TARGET_SVE2"
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] Long binary arithmetic
;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 61ca4990b94170f016a9f50e3505c8cfb24df9be..3757998c0ea9831b526a5bbc8568933fc05ed5d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1935,6 +1935,11 @@ (define_mode_attr VNARROW [(VNx8HI "VNx16QI")
(VNx2DI "VNx4SI") (VNx2DF "VNx4SF")
(VNx8SI "VNx8HI") (VNx16SI "VNx16QI")
(VNx8DI "VNx8HI")])
+(define_mode_attr Vnarrow [(VNx8HI "vnx16qi")
+ (VNx4SI "vnx8hi") (VNx4SF "vnx8hf")
+ (VNx2DI "vnx4si") (VNx2DF "vnx4sf")
+ (VNx8SI "vnx8hi") (VNx16SI "vnx16qi")
+ (VNx8DI "vnx8hi")])
;; Suffix mapping Advanced SIMD modes to be expanded as SVE instructions.
(define_mode_attr sve_di_suf [(VNx16QI "") (VNx8HI "") (VNx4SI "") (VNx2DI "")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..6a347072ae892ceeabddc05dbb4ead4814dda2da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_1.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** ...
+** sub z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
+** udot z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
+** ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+/*
+** foo2_int:
+** ...
+** add z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** uaddwb z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
+** uaddwt z[0-9]+.s, z[0-9]+.s, z[0-9]+.h
+** ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 8000; i++)
+ {
+ x[i] = x[i] + y[i];
+ sum += x[i];
+ }
+ return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..f9ae97158688aad60ed2b705c02621ee7a33e6ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/pr122069_2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve2_hw } */
+/* { dg-options "-O3 -march=armv8-a+sve2 -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+ return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ sum += char_abs(x[i] - y[i]);
+ return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+ unsigned short * restrict z) {
+ int sum = 0;
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ z[i] = x[i] + y[i];
+ sum += z[i];
+ }
+ return sum;
+}
+
+int main ()
+{
+ unsigned short a[100];
+ unsigned short b[100];
+ unsigned short r1[100];
+ unsigned short r2[100];
+ unsigned char c[100];
+ unsigned char d[100];
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ {
+ a[i] = c[i] = i;
+ b[i] = d[i] = 100 - i;
+ }
+
+ if (foo_int (c, d) != foo_int2 (c, d))
+ __builtin_abort();
+
+
+ if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+ __builtin_abort();
+
+#pragma GCC novector
+ for (int i = 0; i < 100; i++)
+ if (r1[i] != r2[i])
+ __builtin_abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file