This patch is a mechanical rewrite of the widen_[us]sum optabs from direct
optabs to conversion optabs.  As a result, the output mode has to be added to
the names of the existing patterns.

No change in functionality is expected.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        PR middle-end/122069
        * config/aarch64/aarch64-simd.md (widen_ssum<mode>3): Change into ...
        (widen_ssum<Vdblw><mode>3, widen_ssum<Vwide><mode>3): ... these.
        (widen_usum<mode>3): Change into ...
        (widen_usum<Vdblw><mode>3, widen_usum<Vwide><mode>3): ... these.
        * config/aarch64/iterators.md (Vdblw): New.
        (Vwide): Extend to match VWIDE.

gcc/testsuite/ChangeLog:

        PR middle-end/122069
        * gcc.target/aarch64/pr122069_1.c: New test.
        * gcc.target/aarch64/pr122069_2.c: New test.

---
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
0d5b02a739fa74724d6dc8b658638d55b8db6890..6488119a14020c801f9994ef84250ceb5ba15481
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4647,7 +4647,7 @@ (define_insn 
"aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>"
 
 ;; <su><addsub>w<q>.
 
-(define_expand "widen_ssum<mode>3"
+(define_expand "widen_ssum<Vdblw><mode>3"
   [(set (match_operand:<VDBLW> 0 "register_operand")
        (plus:<VDBLW> (sign_extend:<VDBLW> 
                        (match_operand:VQW 1 "register_operand"))
@@ -4664,7 +4664,7 @@ (define_expand "widen_ssum<mode>3"
   }
 )
 
-(define_expand "widen_ssum<mode>3"
+(define_expand "widen_ssum<Vwide><mode>3"
   [(set (match_operand:<VWIDE> 0 "register_operand")
        (plus:<VWIDE> (sign_extend:<VWIDE>
                        (match_operand:VD_BHSI 1 "register_operand"))
@@ -4675,7 +4675,7 @@ (define_expand "widen_ssum<mode>3"
   DONE;
 })
 
-(define_expand "widen_usum<mode>3"
+(define_expand "widen_usum<Vdblw><mode>3"
   [(set (match_operand:<VDBLW> 0 "register_operand")
        (plus:<VDBLW> (zero_extend:<VDBLW> 
                        (match_operand:VQW 1 "register_operand"))
@@ -4692,7 +4692,7 @@ (define_expand "widen_usum<mode>3"
   }
 )
 
-(define_expand "widen_usum<mode>3"
+(define_expand "widen_usum<Vwide><mode>3"
   [(set (match_operand:<VWIDE> 0 "register_operand")
        (plus:<VWIDE> (zero_extend:<VWIDE>
                        (match_operand:VD_BHSI 1 "register_operand"))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 
332e7ffd2eaf6597f7bce9c22df70a72ebfe8164..61ca4990b94170f016a9f50e3505c8cfb24df9be
 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1901,6 +1901,11 @@ (define_mode_attr VDBLW [(V8QI "V4HI") (V16QI "V8HI")
                   (V4HI "V2SI") (V8HI "V4SI")
                   (V2SI "DI")   (V4SI "V2DI")])
 
+;; Modes with double-width elements.
+(define_mode_attr Vdblw [(V8QI "v4hi") (V16QI "v8hi")
+                        (V4HI "v2si") (V8HI "v4si")
+                        (V2SI "di")   (V4SI "v2di")])
+
 (define_mode_attr VQUADW [(V8QI "V4SI") (V16QI "V8SI")
                   (V4HI "V2DI") (V8HI "V4DI")])
 
@@ -2003,7 +2008,9 @@ (define_mode_attr v2xwide [(V8QI "v8hi") (V4HI "v4si")
 (define_mode_attr VWIDE_PRED [(VNx8HF "VNx4BI") (VNx4SF "VNx2BI")])
 
 ;; Widened modes of vector modes, lowercase
-(define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")
+(define_mode_attr Vwide [(V2SI "v2di") (V4HI "v4si")
+                        (V2SF "v2df") (V4HF "v4sf")
+                        (V8QI "v8hi")
                         (VNx16QI "vnx8hi") (VNx8HI "vnx4si")
                         (VNx4SI  "vnx2di")
                         (VNx8HF  "vnx4sf") (VNx4SF "vnx2df")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_1.c 
b/gcc/testsuite/gcc.target/aarch64/pr122069_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..b2f973261ea0df8d3d5c7da29834b35ed21a4d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only --param 
vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks 
-fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+**     ...
+**     sub     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**     zip1    v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**     zip2    v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**     uaddw   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**     uaddw2  v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+**     uaddw   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**     uaddw2  v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+**     ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/*
+** foo2_int:
+**     ...
+**     add     v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+**     uaddw   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**     uaddw2  v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+**     ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];
+      sum += x[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_2.c 
b/gcc/testsuite/gcc.target/aarch64/pr122069_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..c6a276f88a67a50038268c177bc60f4dee5258f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_2.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only --param 
vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks 
-fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+            unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+             unsigned short * restrict z) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];
+  unsigned short r2[100];
+  unsigned char c[100];
+  unsigned char d[100];
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;
+      b[i] = d[i] = 100 - i;
+    }
+
+  if (foo_int (c, d) != foo_int2 (c, d))
+    __builtin_abort();
+
+
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+    __builtin_abort();
+
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file


-- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 0d5b02a739fa74724d6dc8b658638d55b8db6890..6488119a14020c801f9994ef84250ceb5ba15481 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4647,7 +4647,7 @@ (define_insn "aarch64_<ANY_EXTEND:su><ADDSUB:optab>l<mode>"
 
 ;; <su><addsub>w<q>.
 
-(define_expand "widen_ssum<mode>3"
+(define_expand "widen_ssum<Vdblw><mode>3"
   [(set (match_operand:<VDBLW> 0 "register_operand")
 	(plus:<VDBLW> (sign_extend:<VDBLW> 
 		        (match_operand:VQW 1 "register_operand"))
@@ -4664,7 +4664,7 @@ (define_expand "widen_ssum<mode>3"
   }
 )
 
-(define_expand "widen_ssum<mode>3"
+(define_expand "widen_ssum<Vwide><mode>3"
   [(set (match_operand:<VWIDE> 0 "register_operand")
 	(plus:<VWIDE> (sign_extend:<VWIDE>
 		        (match_operand:VD_BHSI 1 "register_operand"))
@@ -4675,7 +4675,7 @@ (define_expand "widen_ssum<mode>3"
   DONE;
 })
 
-(define_expand "widen_usum<mode>3"
+(define_expand "widen_usum<Vdblw><mode>3"
   [(set (match_operand:<VDBLW> 0 "register_operand")
 	(plus:<VDBLW> (zero_extend:<VDBLW> 
 		        (match_operand:VQW 1 "register_operand"))
@@ -4692,7 +4692,7 @@ (define_expand "widen_usum<mode>3"
   }
 )
 
-(define_expand "widen_usum<mode>3"
+(define_expand "widen_usum<Vwide><mode>3"
   [(set (match_operand:<VWIDE> 0 "register_operand")
 	(plus:<VWIDE> (zero_extend:<VWIDE>
 		        (match_operand:VD_BHSI 1 "register_operand"))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 332e7ffd2eaf6597f7bce9c22df70a72ebfe8164..61ca4990b94170f016a9f50e3505c8cfb24df9be 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1901,6 +1901,11 @@ (define_mode_attr VDBLW [(V8QI "V4HI") (V16QI "V8HI")
                   (V4HI "V2SI") (V8HI "V4SI")
                   (V2SI "DI")   (V4SI "V2DI")])
 
+;; Modes with double-width elements.
+(define_mode_attr Vdblw [(V8QI "v4hi") (V16QI "v8hi")
+			 (V4HI "v2si") (V8HI "v4si")
+			 (V2SI "di")   (V4SI "v2di")])
+
 (define_mode_attr VQUADW [(V8QI "V4SI") (V16QI "V8SI")
                   (V4HI "V2DI") (V8HI "V4DI")])
 
@@ -2003,7 +2008,9 @@ (define_mode_attr v2xwide [(V8QI "v8hi") (V4HI "v4si")
 (define_mode_attr VWIDE_PRED [(VNx8HF "VNx4BI") (VNx4SF "VNx2BI")])
 
 ;; Widened modes of vector modes, lowercase
-(define_mode_attr Vwide [(V2SF "v2df") (V4HF "v4sf")
+(define_mode_attr Vwide [(V2SI "v2di") (V4HI "v4si")
+			 (V2SF "v2df") (V4HF "v4sf")
+			 (V8QI "v8hi")
 			 (VNx16QI "vnx8hi") (VNx8HI "vnx4si")
 			 (VNx4SI  "vnx2di")
 			 (VNx8HF  "vnx4sf") (VNx4SF "vnx2df")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/pr122069_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b2f973261ea0df8d3d5c7da29834b35ed21a4d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+** 	...
+** 	sub	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** 	zip1	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** 	zip2	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** 	...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/*
+** foo2_int:
+** 	...
+** 	add	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** 	uaddw	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** 	uaddw2	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+** 	...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];
+      sum += x[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/pr122069_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..c6a276f88a67a50038268c177bc60f4dee5258f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_2.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -march=armv8-a -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+	     unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+	      unsigned short * restrict z) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];
+  unsigned short r2[100];
+  unsigned char c[100];
+  unsigned char d[100];
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;
+      b[i] = d[i] = 100 - i;
+    }
+
+  if (foo_int (c, d) != foo_int2 (c, d))
+    __builtin_abort();
+
+
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+    __builtin_abort();
+
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file

Reply via email to