Hi All,

The vectorizer now tries to maintain the target VF that a user wanted by
increasing the unroll factor if the user used pragma GCC unroll and we've
vectorized the loop.

This change makes the AArch64 backend honor this initial value being set by
the vectorizer.

Consider the loop

void f1 (int *restrict a, int n)
{
#pragma GCC unroll 4
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}

The target can then choose to create multiple epilogues to deal with the "rest".

The example above now generates:

.L4:
        ldr     q31, [x2]
        add     v31.4s, v31.4s, v31.4s
        str     q31, [x2], 16
        cmp     x2, x3
        bne     .L4

as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:

.L4:
        ldp     q30, q31, [x2]
        add     v30.4s, v30.4s, v30.4s
        add     v31.4s, v31.4s, v31.4s
        stp     q30, q31, [x2], 32
        cmp     x3, x2
        bne     .L4

Note that as a follow up I plan on looking into asking the vectorizer to
generate multiple epilogues when we do unroll like this as we can
re-request the same mode but without the unroll as the first epilogue.
For now I have added a TODO since e.g. for early break we don't support vector
epilogues yet, and multiple epilogues need some thought and internal
discussion.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64.cc
        (aarch64_vector_costs::determine_suggested_unroll_factor): Use
        m_suggested_unroll_factor instead of 1.
        (aarch64_vector_costs::finish_cost): Add todo for epilogues.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/unroll-vect.c: New test.

---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..cf6f56a08d67044c8dc34578902eb4cb416641bd
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18075,7 +18075,7 @@ aarch64_vector_costs::determine_suggested_unroll_factor 
()
   if (!sve && !TARGET_SVE2 && m_has_avg)
     return 1;
 
-  unsigned int max_unroll_factor = 1;
+  unsigned int max_unroll_factor = m_suggested_unroll_factor;
   for (auto vec_ops : m_ops)
     {
       aarch64_simd_vec_issue_info const *vec_issue
@@ -18293,6 +18293,8 @@ aarch64_vector_costs::finish_cost (const vector_costs 
*uncast_scalar_costs)
                                             m_costs[vect_body]);
       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
 
+      /* TODO: Add support for multiple epilogues and costing for early break. 
 */
+
       /* For gather and scatters there's an additional overhead for the first
         iteration.  For low count loops they're not beneficial so model the
         overhead as loop prologue costs.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c 
b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
new file mode 100644
index 
0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8-a --param 
aarch64-autovec-preference=asimd-only -std=gnu99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+**     ...
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**     ...
+*/
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 16
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+


-- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..cf6f56a08d67044c8dc34578902eb4cb416641bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18075,7 +18075,7 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
   if (!sve && !TARGET_SVE2 && m_has_avg)
     return 1;
 
-  unsigned int max_unroll_factor = 1;
+  unsigned int max_unroll_factor = m_suggested_unroll_factor;
   for (auto vec_ops : m_ops)
     {
       aarch64_simd_vec_issue_info const *vec_issue
@@ -18293,6 +18293,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 					     m_costs[vect_body]);
       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
 
+      /* TODO: Add support for multiple epilogues and costing for early break.  */
+
       /* For gather and scatters there's an additional overhead for the first
 	 iteration.  For low count loops they're not beneficial so model the
 	 overhead as loop prologue costs.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/unroll-vect.c b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
new file mode 100644
index 0000000000000000000000000000000000000000..3cb774ba95787ebee488fbe7306299ef28e6bb35
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/unroll-vect.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -march=armv8-a --param aarch64-autovec-preference=asimd-only -std=gnu99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f1:
+**	...
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	...
+*/
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 16
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+

Reply via email to