[PATCH] Honor OpenMP simdlen in the vectorizer

Jakub Jelinek Fri, 17 May 2019 01:09:16 -0700

Hi!

When a simdlen clause is specified on a simd loop, it specifies the preferred
vectorization factor.  It is only a preference, so if there is no possibility
of satisfying it, we can do something else, but we still shouldn't ignore it
the way we have been ignoring it so far.


Unfortunately, we iterate over vectorization sizes rather than over
vectorization factors, so in order to determine the vectorization factor
for a given size, we need to actually analyze the loop with that size.

With the following patch, when the vectorizer sees a possible vectorization
which doesn't have the requested vectorization factor, it remembers the first
such vectorization and continues searching; if no vectorization size with the
right vectorization factor is found, it just uses the first one.

Another thing is that on x86 with -mprefer-vector-width={256,128} (the
former is the default), we don't actually push all the possible
vectorization sizes.  IMHO, when one uses the simd construct and specifies,
say, simdlen(16) for a loop which just uses ints, the user wants to use %zmmN
operations even if the default is -mprefer-vector-width=256 or even if that
option is used explicitly.  Perhaps one option would be to push the
64 size to the vector always, just when it is not preferred put it last, but
then even for normal loops if 32 and 16 byte vectorization is unsuccessful,
we'd either waste compile time or in rare corner cases could in theory
vectorize using that vectorization size even when it is not preferred.
So, the patch adds an argument and does that only when the simdlen clause
is used.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2019-05-17  Jakub Jelinek  <ja...@redhat.com>

        * cfgloop.h (struct loop): Add simdlen member.
        * omp-expand.c (expand_omp_simd): Set it if simdlen clause is present.
        * tree-vect-loop.c (vect_analyze_loop): Pass loop->simdlen != 0
        as new argument to autovectorize_vector_sizes target hook.  If
        loop->simdlen, pick up vector size where the vectorization factor
        is equal to loop->simdlen, and if there is none, fall back to the first
        successful one.
        (vect_transform_loop): Adjust autovectorize_vector_sizes target hook
        caller.
        * omp-low.c (omp_clause_aligned_alignment): Likewise.
        * omp-general.c (omp_max_vf): Likewise.
        * optabs-query.c (can_vec_mask_load_store_p): Likewise.
        * tree-vect-slp.c (vect_slp_bb): Likewise.
        * target.def (autovectorize_vector_sizes): Add ALL argument and
        document it.
        * doc/tm.texi: Adjust documentation.
        * targhooks.c (default_autovectorize_vector_sizes): Add bool argument.
        * targhooks.h (default_autovectorize_vector_sizes): Likewise.
        * config/aarch64/aarch64.c (aarch64_autovectorize_vector_sizes): Add
        bool argument.
        * config/arc/arc.c (arc_autovectorize_vector_sizes): Likewise.
        * config/arm/arm.c (arm_autovectorize_vector_sizes): Likewise.
        * config/mips/mips.c (mips_autovectorize_vector_sizes): Likewise.
        * config/i386/i386.c (ix86_autovectorize_vector_sizes): Likewise.  If
        true and TARGET_AVX512F or TARGET_AVX, push 3 or 2 sizes even if
        preferred vector size is not 512-bit or 256-bit, just put those
        unpreferred ones last.

        * gcc.target/i386/avx512f-simd-1.c: New test.

--- gcc/cfgloop.h.jj    2019-03-08 11:43:35.063317726 +0100
+++ gcc/cfgloop.h       2019-05-16 15:52:05.974315760 +0200
@@ -174,6 +174,9 @@ struct GTY ((chain_next ("%h.next"))) lo
      of the loop can be safely evaluated concurrently.  */
   int safelen;
 
+  /* Preferred vectorization factor for the loop if non-zero.  */
+  int simdlen;
+
   /* Constraints are generally set by consumers and affect certain
      semantics of niter analyzer APIs.  Currently the APIs affected are
      number_of_iterations_exit* functions and their callers.  One typical
--- gcc/omp-expand.c.jj 2019-05-15 23:42:16.049859907 +0200
+++ gcc/omp-expand.c    2019-05-16 16:10:46.093932348 +0200
@@ -4974,6 +4974,13 @@ expand_omp_simd (struct omp_region *regi
          && loop->safelen > 1)
        {
          loop->force_vectorize = true;
+         if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)))
+           {
+             unsigned HOST_WIDE_INT v
+               = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen));
+             if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen)
+               loop->simdlen = v;
+           }
          cfun->has_force_vectorize_loops = true;
        }
       else if (dont_vectorize)
--- gcc/tree-vect-loop.c.jj     2019-05-16 15:25:17.826832201 +0200
+++ gcc/tree-vect-loop.c        2019-05-16 19:00:33.999540073 +0200
@@ -2254,7 +2254,8 @@ vect_analyze_loop (struct loop *loop, lo
 
   /* Autodetect first vector size we try.  */
   current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
+                                               loop->simdlen != 0);
   unsigned int next_size = 0;
 
   DUMP_VECT_SCOPE ("analyze_loop_nest");
@@ -2273,6 +2274,8 @@ vect_analyze_loop (struct loop *loop, lo
 
   unsigned n_stmts = 0;
   poly_uint64 autodetected_vector_size = 0;
+  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
+  poly_uint64 first_vector_size = 0;
   while (1)
     {
       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
@@ -2283,6 +2286,7 @@ vect_analyze_loop (struct loop *loop, lo
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad loop form.\n");
+         gcc_checking_assert (first_loop_vinfo == NULL);
          return loop_vinfo;
        }
 
@@ -2296,10 +2300,27 @@ vect_analyze_loop (struct loop *loop, lo
        {
          LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
 
-         return loop_vinfo;
+         if (loop->simdlen
+             && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                          (unsigned HOST_WIDE_INT) loop->simdlen))
+           {
+             if (first_loop_vinfo == NULL)
+               {
+                 first_loop_vinfo = loop_vinfo;
+                 first_vector_size = current_vector_size;
+                 loop->aux = NULL;
+               }
+             else
+               delete loop_vinfo;
+           }
+         else
+           {
+             delete first_loop_vinfo;
+             return loop_vinfo;
+           }
        }
-
-      delete loop_vinfo;
+      else
+       delete loop_vinfo;
 
       if (next_size == 0)
        autodetected_vector_size = current_vector_size;
@@ -2308,10 +2329,31 @@ vect_analyze_loop (struct loop *loop, lo
          && known_eq (vector_sizes[next_size], autodetected_vector_size))
        next_size += 1;
 
-      if (fatal
-         || next_size == vector_sizes.length ()
+      if (fatal)
+       {
+         gcc_checking_assert (first_loop_vinfo == NULL);
+         return opt_loop_vec_info::propagate_failure (res);
+       }
+
+      if (next_size == vector_sizes.length ()
          || known_eq (current_vector_size, 0U))
-       return opt_loop_vec_info::propagate_failure (res);
+       {
+         if (first_loop_vinfo)
+           {
+             current_vector_size = first_vector_size;
+             loop->aux = (loop_vec_info) first_loop_vinfo;
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "***** Choosing vector size ");
+                 dump_dec (MSG_NOTE, current_vector_size);
+                 dump_printf (MSG_NOTE, "\n");
+               }
+             return first_loop_vinfo;
+           }
+         else
+           return opt_loop_vec_info::propagate_failure (res);
+       }
 
       /* Try the next biggest vector size.  */
       current_vector_size = vector_sizes[next_size++];
@@ -8670,7 +8712,8 @@ vect_transform_loop (loop_vec_info loop_
   if (epilogue)
     {
       auto_vector_sizes vector_sizes;
-      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+      targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
+                                                   loop->simdlen != 0);
       unsigned int next_size = 0;
 
       /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
--- gcc/tree-vect-slp.c.jj      2019-05-14 21:37:33.653388439 +0200
+++ gcc/tree-vect-slp.c 2019-05-16 18:59:12.825873858 +0200
@@ -2983,7 +2983,7 @@ vect_slp_bb (basic_block bb)
 
   /* Autodetect first vector size we try.  */
   current_vector_size = 0;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
   unsigned int next_size = 0;
 
   gsi = gsi_start_bb (bb);
--- gcc/target.def.jj   2019-02-18 20:48:35.742681472 +0100
+++ gcc/target.def      2019-05-16 18:55:50.373200394 +0200
@@ -1899,12 +1899,14 @@ DEFHOOK
 the only one that is worth considering, this hook should add all suitable\n\
 vector sizes to @var{sizes}, in order of decreasing preference.  The first\n\
 one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.\n\
+If @var{all} is true, add suitable vector sizes even when they are generally\n\
+not expected to be worthwhile.\n\
 \n\
 The hook does not need to do anything if the vector returned by\n\
 @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\
 for autovectorization.  The default implementation does nothing.",
  void,
- (vector_sizes *sizes),
+ (vector_sizes *sizes, bool all),
  default_autovectorize_vector_sizes)
 
 /* Function to get a target mode for a vector mask.  */
--- gcc/doc/tm.texi.jj  2019-02-18 20:48:34.132707883 +0100
+++ gcc/doc/tm.texi     2019-05-16 19:08:05.975113214 +0200
@@ -6016,11 +6016,13 @@ against lower halves of vectors recursiv
 reached.  The default is @var{mode} which means no splitting.
 @end deftypefn
 
-@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes})
+@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}, bool @var{all})
 If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not
 the only one that is worth considering, this hook should add all suitable
 vector sizes to @var{sizes}, in order of decreasing preference.  The first
 one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.
+If @var{all} is true, add suitable vector sizes even when they are generally
+not expected to be worthwhile.
 
 The hook does not need to do anything if the vector returned by
 @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant
--- gcc/targhooks.c.jj  2019-04-17 21:21:40.918117115 +0200
+++ gcc/targhooks.c     2019-05-16 18:56:38.586408190 +0200
@@ -1316,7 +1316,7 @@ default_split_reduction (machine_mode mo
    is tried.  */
 
 void
-default_autovectorize_vector_sizes (vector_sizes *)
+default_autovectorize_vector_sizes (vector_sizes *, bool)
 {
 }
 
--- gcc/targhooks.h.jj  2019-01-16 09:35:04.563323106 +0100
+++ gcc/targhooks.h     2019-05-16 18:56:27.002598531 +0200
@@ -110,7 +110,7 @@ default_builtin_support_vector_misalignm
                                             int, bool);
 extern machine_mode default_preferred_simd_mode (scalar_mode mode);
 extern machine_mode default_split_reduction (machine_mode);
-extern void default_autovectorize_vector_sizes (vector_sizes *);
+extern void default_autovectorize_vector_sizes (vector_sizes *, bool);
 extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64);
 extern bool default_empty_mask_is_expensive (unsigned);
 extern void *default_init_cost (struct loop *);
--- gcc/omp-low.c.jj    2019-05-16 15:04:41.785179634 +0200
+++ gcc/omp-low.c       2019-05-16 18:58:07.253951283 +0200
@@ -3600,7 +3600,7 @@ omp_clause_aligned_alignment (tree claus
   unsigned int al = 1;
   opt_scalar_mode mode_iter;
   auto_vector_sizes sizes;
-  targetm.vectorize.autovectorize_vector_sizes (&sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&sizes, true);
   poly_uint64 vs = 0;
   for (unsigned int i = 0; i < sizes.length (); ++i)
     vs = ordered_max (vs, sizes[i]);
--- gcc/omp-general.c.jj        2019-02-22 15:22:20.880919652 +0100
+++ gcc/omp-general.c   2019-05-16 18:57:05.254969995 +0200
@@ -469,7 +469,7 @@ omp_max_vf (void)
     return 1;
 
   auto_vector_sizes sizes;
-  targetm.vectorize.autovectorize_vector_sizes (&sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&sizes, true);
   if (!sizes.is_empty ())
     {
       poly_uint64 vf = 0;
--- gcc/optabs-query.c.jj       2019-02-11 11:38:08.177618415 +0100
+++ gcc/optabs-query.c  2019-05-16 18:58:48.830268128 +0200
@@ -593,7 +593,7 @@ can_vec_mask_load_store_p (machine_mode
     return true;
 
   auto_vector_sizes vector_sizes;
-  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
+  targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, true);
   for (unsigned int i = 0; i < vector_sizes.length (); ++i)
     {
       poly_uint64 cur = vector_sizes[i];
--- gcc/config/aarch64/aarch64.c.jj     2019-05-11 11:32:58.229357774 +0200
+++ gcc/config/aarch64/aarch64.c        2019-05-16 19:04:18.269854907 +0200
@@ -14105,7 +14105,7 @@ aarch64_preferred_simd_mode (scalar_mode
 /* Return a list of possible vector sizes for the vectorizer
    to iterate over.  */
 static void
-aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
+aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
 {
   if (TARGET_SVE)
     sizes->safe_push (BYTES_PER_SVE_VECTOR);
--- gcc/config/arc/arc.c.jj     2019-04-24 17:44:44.280019376 +0200
+++ gcc/config/arc/arc.c        2019-05-16 19:04:31.934630363 +0200
@@ -480,7 +480,7 @@ arc_preferred_simd_mode (scalar_mode mod
    TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES.  */
 
 static void
-arc_autovectorize_vector_sizes (vector_sizes *sizes)
+arc_autovectorize_vector_sizes (vector_sizes *sizes, bool)
 {
   if (TARGET_PLUS_QMACW)
     {
--- gcc/config/arm/arm.c.jj     2019-05-10 09:31:31.113119373 +0200
+++ gcc/config/arm/arm.c        2019-05-16 19:04:51.586307442 +0200
@@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_m
 static void arm_conditional_register_usage (void);
 static enum flt_eval_method arm_excess_precision (enum excess_precision_type);
 static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
-static void arm_autovectorize_vector_sizes (vector_sizes *);
+static void arm_autovectorize_vector_sizes (vector_sizes *, bool);
 static int arm_default_branch_cost (bool, bool);
 static int arm_cortex_a5_branch_cost (bool, bool);
 static int arm_cortex_m_branch_cost (bool, bool);
@@ -28347,7 +28347,7 @@ arm_vector_alignment (const_tree type)
 }
 
 static void
-arm_autovectorize_vector_sizes (vector_sizes *sizes)
+arm_autovectorize_vector_sizes (vector_sizes *sizes, bool)
 {
   if (!TARGET_NEON_VECTORIZE_DOUBLE)
     {
--- gcc/config/i386/i386.c.jj   2019-05-15 23:36:47.920060787 +0200
+++ gcc/config/i386/i386.c      2019-05-16 19:03:16.217874556 +0200
@@ -21328,7 +21328,7 @@ ix86_preferred_simd_mode (scalar_mode mo
    256bit and 128bit vectors.  */
 
 static void
-ix86_autovectorize_vector_sizes (vector_sizes *sizes)
+ix86_autovectorize_vector_sizes (vector_sizes *sizes, bool all)
 {
   if (TARGET_AVX512F && !TARGET_PREFER_AVX256)
     {
@@ -21336,11 +21336,22 @@ ix86_autovectorize_vector_sizes (vector_
       sizes->safe_push (32);
       sizes->safe_push (16);
     }
+  else if (TARGET_AVX512F && all)
+    {
+      sizes->safe_push (32);
+      sizes->safe_push (16);
+      sizes->safe_push (64);
+    }
   else if (TARGET_AVX && !TARGET_PREFER_AVX128)
     {
       sizes->safe_push (32);
       sizes->safe_push (16);
     }
+  else if (TARGET_AVX && all)
+    {
+      sizes->safe_push (16);
+      sizes->safe_push (32);
+    }
 }
 
 /* Implemenation of targetm.vectorize.get_mask_mode.  */
--- gcc/config/mips/mips.c.jj   2019-05-14 21:37:20.166613524 +0200
+++ gcc/config/mips/mips.c      2019-05-16 19:05:29.124690606 +0200
@@ -13460,7 +13460,7 @@ mips_preferred_simd_mode (scalar_mode mo
 /* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES.  */
 
 static void
-mips_autovectorize_vector_sizes (vector_sizes *sizes)
+mips_autovectorize_vector_sizes (vector_sizes *sizes, bool)
 {
   if (ISA_HAS_MSA)
     sizes->safe_push (16);
--- gcc/testsuite/gcc.target/i386/avx512f-simd-1.c.jj   2019-05-16 19:29:17.556218761 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-simd-1.c      2019-05-16 19:23:50.508592664 +0200
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-fopenmp-simd -O2 -mavx512f -masm=att" } */
+/* { dg-final { scan-assembler "vpadd\[^\n\r]*%xmm" } } */
+/* { dg-final { scan-assembler "vpadd\[^\n\r]*%ymm" } } */
+/* { dg-final { scan-assembler "vpadd\[^\n\r]*%zmm" } } */
+
+#define N 1024
+int a[N];
+
+void
+f1 (void)
+{
+  int i;
+  #pragma omp simd simdlen (4)
+  for (i = 0; i < N; ++i)
+    a[i] = a[i] + 1;
+}
+
+void
+f2 (void)
+{
+  int i;
+  #pragma omp simd simdlen (8)
+  for (i = 0; i < N; ++i)
+    a[i] = a[i] + 2;
+}
+
+void
+f3 (void)
+{
+  int i;
+  #pragma omp simd simdlen (16)
+  for (i = 0; i < N; ++i)
+    a[i] = a[i] + 3;
+}

        Jakub

[PATCH] Honor OpenMP simdlen in the vectorizer

Reply via email to