This patch adds a mode in which the vectoriser tries each available
base vector mode and picks the one with the lowest cost.  For now
the behaviour is behind a default-off --param, but a later patch
enables it by default for SVE.

The patch keeps the current behaviour of preferring a VF of
loop->simdlen over any larger or smaller VF, regardless of costs
or target preferences.


2019-11-05  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
        * params.def (vect-compare-loop-costs): New param.
        * doc/invoke.texi: Document it.
        * tree-vectorizer.h (_loop_vec_info::vec_outside_cost)
        (_loop_vec_info::vec_inside_cost): New member variables.
        * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them.
        (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions.
        (vect_analyze_loop): When the new parameter allows, try vectorizing
        the loop with each available vector mode and picking the one with
        the lowest cost.
        (vect_estimate_min_profitable_iters): Record the computed costs
        in the loop_vec_info.

Index: gcc/params.def
===================================================================
--- gcc/params.def      2019-10-31 17:15:25.470517368 +0000
+++ gcc/params.def      2019-11-05 14:19:58.781197820 +0000
@@ -661,6 +661,13 @@ DEFPARAM(PARAM_VECT_MAX_PEELING_FOR_ALIG
          "Maximum number of loop peels to enhance alignment of data references 
in a loop.",
          -1, -1, 64)
 
+DEFPARAM(PARAM_VECT_COMPARE_LOOP_COSTS,
+        "vect-compare-loop-costs",
+        "Whether to try vectorizing a loop using each supported"
+        " combination of vector types and picking the version with the"
+        " lowest cost.",
+        0, 0, 1)
+
 DEFPARAM(PARAM_MAX_CSELIB_MEMORY_LOCATIONS,
         "max-cselib-memory-locations",
         "The maximum memory locations recorded by cselib.",
Index: gcc/doc/invoke.texi
===================================================================
--- gcc/doc/invoke.texi 2019-11-04 21:13:57.611756365 +0000
+++ gcc/doc/invoke.texi 2019-11-05 14:19:58.777197850 +0000
@@ -11563,6 +11563,12 @@ doing loop versioning for alias in the v
 The maximum number of loop peels to enhance access alignment
 for vectorizer. Value -1 means no limit.
 
+@item vect-compare-loop-costs
+Whether to try vectorizing a loop using each supported combination of
+vector types and picking the version with the lowest cost.  This parameter
+has no effect when @option{-fno-vect-cost-model} or
+@option{-fvect-cost-model=unlimited} are used.
+
 @item max-iterations-to-track
 The maximum number of iterations of a loop the brute-force algorithm
 for analysis of the number of iterations of the loop tries to evaluate.
Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h       2019-11-05 14:19:33.829371745 +0000
+++ gcc/tree-vectorizer.h       2019-11-05 14:19:58.781197820 +0000
@@ -601,6 +601,13 @@ typedef class _loop_vec_info : public ve
   /* Cost of a single scalar iteration.  */
   int single_scalar_iteration_cost;
 
+  /* The cost of the vector prologue and epilogue, including peeled
+     iterations and set-up code.  */
+  int vec_outside_cost;
+
+  /* The cost of the vector loop body.  */
+  int vec_inside_cost;
+
   /* Is the loop vectorizable? */
   bool vectorizable;
 
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        2019-11-05 14:19:33.829371745 +0000
+++ gcc/tree-vect-loop.c        2019-11-05 14:19:58.781197820 +0000
@@ -830,6 +830,8 @@ _loop_vec_info::_loop_vec_info (class lo
     scan_map (NULL),
     slp_unrolling_factor (1),
     single_scalar_iteration_cost (0),
+    vec_outside_cost (0),
+    vec_inside_cost (0),
     vectorizable (false),
     can_fully_mask_p (true),
     fully_masked_p (false),
@@ -2373,6 +2375,80 @@ vect_analyze_loop_2 (loop_vec_info loop_
   goto start_over;
 }
 
+/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
+   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
+   OLD_LOOP_VINFO is better unless something specifically indicates
+   otherwise.
+
+   Note that this deliberately isn't a partial order.  */
+
+static bool
+vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
+                         loop_vec_info old_loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
+  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
+
+  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
+  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
+
+  /* Always prefer a VF of loop->simdlen over any other VF.  */
+  if (loop->simdlen)
+    {
+      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
+      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
+      if (new_simdlen_p != old_simdlen_p)
+       return new_simdlen_p;
+    }
+
+  /* Limit the VFs to what is likely to be the maximum number of iterations,
+     to handle cases in which at least one loop_vinfo is fully-masked.  */
+  HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
+  if (estimated_max_niter != -1)
+    {
+      if (known_le (estimated_max_niter, new_vf))
+       new_vf = estimated_max_niter;
+      if (known_le (estimated_max_niter, old_vf))
+       old_vf = estimated_max_niter;
+    }
+
+  /* Check whether the (fractional) cost per scalar iteration is lower
+     or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
+  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
+                            * poly_widest_int (old_vf));
+  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
+                            * poly_widest_int (new_vf));
+  if (maybe_lt (rel_old, rel_new))
+    return false;
+  if (known_lt (rel_new, rel_old))
+    return true;
+
+  /* If there's nothing to choose between the loop bodies, see whether
+     there's a difference in the prologue and epilogue costs.  */
+  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
+    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
+
+  return false;
+}
+
+/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
+   true if we should.  */
+
+static bool
+vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
+                       loop_vec_info old_loop_vinfo)
+{
+  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
+    return false;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "***** Preferring vector mode %s to vector mode %s\n",
+                    GET_MODE_NAME (new_loop_vinfo->vector_mode),
+                    GET_MODE_NAME (old_loop_vinfo->vector_mode));
+  return true;
+}
+
 /* Function vect_analyze_loop.
 
    Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -2408,6 +2484,8 @@ vect_analyze_loop (class loop *loop, vec
   machine_mode next_vector_mode = VOIDmode;
   poly_uint64 lowest_th = 0;
   unsigned vectorized_loops = 0;
+  bool pick_lowest_cost_p = (PARAM_VALUE (PARAM_VECT_COMPARE_LOOP_COSTS)
+                            && !unlimited_cost_model (loop));
 
   bool vect_epilogues = false;
   opt_result res = opt_result::success ();
@@ -2428,6 +2506,34 @@ vect_analyze_loop (class loop *loop, vec
 
       bool fatal = false;
 
+      /* When pick_lowest_cost_p is true, we should in principle iterate
+        over all the loop_vec_infos that LOOP_VINFO could replace and
+        try to vectorize LOOP_VINFO under the same conditions.
+        E.g. when trying to replace an epilogue loop, we should vectorize
+        LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
+        to replace the main loop, we should vectorize LOOP_VINFO as a main
+        loop too.
+
+        However, autovectorize_vector_modes is usually sorted as follows:
+
+        - Modes that naturally produce lower VFs usually follow modes that
+          naturally produce higher VFs.
+
+        - When modes naturally produce the same VF, maskable modes
+          usually follow unmaskable ones, so that the maskable mode
+          can be used to vectorize the epilogue of the unmaskable mode.
+
+        This order is preferred because it leads to the maximum
+        epilogue vectorization opportunities.  Targets should only use
+        a different order if they want to make wide modes available while
+        disparaging them relative to earlier, smaller modes.  The assumption
+        in that case is that the wider modes are more expensive in some
+        way that isn't reflected directly in the costs.
+
+        There should therefore be few interesting cases in which
+        LOOP_VINFO fails when treated as an epilogue loop, succeeds when
+        treated as a standalone loop, and ends up being genuinely cheaper
+        than FIRST_LOOP_VINFO.  */
       if (vect_epilogues)
        LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
 
@@ -2475,13 +2581,34 @@ vect_analyze_loop (class loop *loop, vec
              LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
              simdlen = 0;
            }
+         else if (pick_lowest_cost_p && first_loop_vinfo)
+           {
+             /* Keep trying to roll back vectorization attempts while the
+                loop_vec_infos they produced were worse than this one.  */
+             vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
+             while (!vinfos.is_empty ()
+                    && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
+               {
+                 gcc_assert (vect_epilogues);
+                 delete vinfos.pop ();
+               }
+             if (vinfos.is_empty ()
+                 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
+               {
+                 delete first_loop_vinfo;
+                 first_loop_vinfo = opt_loop_vec_info::success (NULL);
+                 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
+               }
+           }
 
          if (first_loop_vinfo == NULL)
            {
              first_loop_vinfo = loop_vinfo;
              lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
            }
-         else if (vect_epilogues)
+         else if (vect_epilogues
+                  /* For now only allow one epilogue loop.  */
+                  && first_loop_vinfo->epilogue_vinfos.is_empty ())
            {
              first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
              poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
@@ -2501,12 +2628,14 @@ vect_analyze_loop (class loop *loop, vec
                            && loop->inner == NULL
                            && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)
                            && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
-                           /* For now only allow one epilogue loop.  */
-                           && first_loop_vinfo->epilogue_vinfos.is_empty ());
+                           /* For now only allow one epilogue loop, but allow
+                              pick_lowest_cost_p to replace it.  */
+                           && (first_loop_vinfo->epilogue_vinfos.is_empty ()
+                               || pick_lowest_cost_p));
 
          /* Commit to first_loop_vinfo if we have no reason to try
             alternatives.  */
-         if (!simdlen && !vect_epilogues)
+         if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
            break;
        }
       else
@@ -3454,7 +3583,11 @@ vect_estimate_min_profitable_iters (loop
               &vec_inside_cost, &vec_epilogue_cost);
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
-  
+
+  /* Stash the costs so that we can compare two loop_vec_infos.  */
+  loop_vinfo->vec_inside_cost = vec_inside_cost;
+  loop_vinfo->vec_outside_cost = vec_outside_cost;
+
   if (dump_enabled_p ())
     {
       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

Reply via email to