When we do loop masking via mask or length, a single scalar iteration
should be sufficient to avoid excess accesses.  This fixes the last
known FAILs with --param vect-force-slp=1.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

Do we know of a case where the peeling isn't sufficient with VL vectors?

The CI will probably fail because of dependent patches I just pushed :/

Thanks,
Richard.

        PR tree-optimization/117558
        * tree-vect-stmts.cc (get_group_load_store_type): Exempt
        VL vector types from the insufficient gap peeling check.
---
 gcc/tree-vect-stmts.cc | 41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index d3552266eee..b5f90803eed 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2181,33 +2181,30 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 
          /* Peeling for gaps assumes that a single scalar iteration
             is enough to make sure the last vector iteration doesn't
-            access excess elements.  */
+            access excess elements.  For variable-length vectors the
+            required loop masking ensures a single iteration is always
+            sufficient.  */
+         unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
          if (overrun_p
-             && (!can_div_trunc_p (group_size
-                                   * LOOP_VINFO_VECT_FACTOR (loop_vinfo) - gap,
-                                   nunits, &tem, &remain)
-                 || maybe_lt (remain + group_size, nunits)))
-           {
+             && nunits.is_constant (&cnunits)
+             && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
+             && ((cremain = (group_size * cvf - gap) % cnunits), true)
+             && cremain + group_size < cnunits
              /* But peeling a single scalar iteration is enough if
                 we can use the next power-of-two sized partial
                 access and that is sufficiently small to be covered
                 by the single scalar iteration.  */
-             unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
-             if (!nunits.is_constant (&cnunits)
-                 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
-                 || (((cremain = (group_size * cvf - gap) % cnunits), true)
-                     && ((cpart_size = (1 << ceil_log2 (cremain))), true)
-                     && (cremain + group_size < cpart_size
-                         || vector_vector_composition_type
-                              (vectype, cnunits / cpart_size,
-                               &half_vtype) == NULL_TREE)))
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "peeling for gaps insufficient for "
-                                    "access\n");
-                 return false;
-               }
+             && ((cpart_size = (1 << ceil_log2 (cremain))), true)
+             && (cremain + group_size < cpart_size
+                 || vector_vector_composition_type
+                      (vectype, cnunits / cpart_size,
+                       &half_vtype) == NULL_TREE))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "peeling for gaps insufficient for "
+                                "access\n");
+             return false;
            }
        }
     }
-- 
2.43.0

Reply via email to