Hi! When a simdlen clause is specified on a simd loop, it specifies the preferred vectorization factor. It is only a preference, so if there is no possibility of satisfying it, we can do something else, but still, we shouldn't ignore it completely, as we have been doing until now.
Unfortunately, we iterate over vectorization sizes rather than over vectorization factors, so in order to determine the vectorization factor, we need to analyze the loop. With the following patch, when the vectorizer sees a possible vectorization which doesn't have the requested vectorization factor, it remembers the first such vectorization and continues searching; if no vectorization size with the right vectorization factor is found, it just uses the first one. Another thing is that on x86 with -mprefer-vector-width={256,128} (the former is the default), we don't actually push all the possible vectorization sizes. IMHO when one uses the simd clause and says, e.g., simdlen(16) for a loop which just uses ints, then the user wants to use %zmmN operations even if the default is -mprefer-vector-width=256 or even if that option is used explicitly. Perhaps one option would be to always push the 64 size to the vector and, when it is not preferred, just put it last, but then even for normal loops, if 32 and 16 byte vectorization is unsuccessful, we'd either waste compile time or in rare corner cases could in theory vectorize using that vectorization size even when it is not preferred. So, the patch adds an argument and does that only when the simdlen clause is used. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2019-05-17 Jakub Jelinek <ja...@redhat.com> * cfgloop.h (struct loop): Add simdlen member. * omp-expand.c (expand_omp_simd): Set it if simdlen clause is present. * tree-vect-loop.c (vect_analyze_loop): Pass loop->simdlen != 0 as new argument to autovectorize_vector_sizes target hook. If loop->simdlen, pick up vector size where the vectorization factor is equal to loop->simdlen, and if there is none, fall back to the first successful one. (vect_transform_loop): Adjust autovectorize_vector_sizes target hook caller. * omp-low.c (omp_clause_aligned_alignment): Likewise. * omp-general.c (omp_max_vf): Likewise. * optabs-query.c (can_vec_mask_load_store_p): Likewise. 
* tree-vect-slp.c (vect_slp_bb): Likewise. * target.def (autovectorize_vector_sizes): Add ALL argument and document it. * doc/tm.texi: Adjust documentation. * targhooks.c (default_autovectorize_vector_sizes): Add bool argument. * targhooks.h (default_autovectorize_vector_sizes): Likewise. * config/aarch64/aarch64.c (aarch64_autovectorize_vector_sizes): Add bool argument. * config/arc/arc.c (arc_autovectorize_vector_sizes): Likewise. * config/arm/arm.c (arm_autovectorize_vector_sizes): Likewise. * config/mips/mips.c (mips_autovectorize_vector_sizes): Likewise. * config/i386/i386.c (ix86_autovectorize_vector_sizes): Likewise. If true and TARGET_AVX512F or TARGET_AVX, push 3 or 2 sizes even if preferred vector size is not 512-bit or 256-bit, just put those unpreferred ones last. * gcc.target/i386/avx512f-simd-1.c: New test. --- gcc/cfgloop.h.jj 2019-03-08 11:43:35.063317726 +0100 +++ gcc/cfgloop.h 2019-05-16 15:52:05.974315760 +0200 @@ -174,6 +174,9 @@ struct GTY ((chain_next ("%h.next"))) lo of the loop can be safely evaluated concurrently. */ int safelen; + /* Preferred vectorization factor for the loop if non-zero. */ + int simdlen; + /* Constraints are generally set by consumers and affect certain semantics of niter analyzer APIs. Currently the APIs affected are number_of_iterations_exit* functions and their callers. 
One typical --- gcc/omp-expand.c.jj 2019-05-15 23:42:16.049859907 +0200 +++ gcc/omp-expand.c 2019-05-16 16:10:46.093932348 +0200 @@ -4974,6 +4974,13 @@ expand_omp_simd (struct omp_region *regi && loop->safelen > 1) { loop->force_vectorize = true; + if (simdlen && tree_fits_uhwi_p (OMP_CLAUSE_SIMDLEN_EXPR (simdlen))) + { + unsigned HOST_WIDE_INT v + = tree_to_uhwi (OMP_CLAUSE_SIMDLEN_EXPR (simdlen)); + if (v < INT_MAX && v <= (unsigned HOST_WIDE_INT) loop->safelen) + loop->simdlen = v; + } cfun->has_force_vectorize_loops = true; } else if (dont_vectorize) --- gcc/tree-vect-loop.c.jj 2019-05-16 15:25:17.826832201 +0200 +++ gcc/tree-vect-loop.c 2019-05-16 19:00:33.999540073 +0200 @@ -2254,7 +2254,8 @@ vect_analyze_loop (struct loop *loop, lo /* Autodetect first vector size we try. */ current_vector_size = 0; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); + targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, + loop->simdlen != 0); unsigned int next_size = 0; DUMP_VECT_SCOPE ("analyze_loop_nest"); @@ -2273,6 +2274,8 @@ vect_analyze_loop (struct loop *loop, lo unsigned n_stmts = 0; poly_uint64 autodetected_vector_size = 0; + opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); + poly_uint64 first_vector_size = 0; while (1) { /* Check the CFG characteristics of the loop (nesting, entry/exit). 
*/ @@ -2283,6 +2286,7 @@ vect_analyze_loop (struct loop *loop, lo if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "bad loop form.\n"); + gcc_checking_assert (first_loop_vinfo == NULL); return loop_vinfo; } @@ -2296,10 +2300,27 @@ vect_analyze_loop (struct loop *loop, lo { LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; - return loop_vinfo; + if (loop->simdlen + && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + (unsigned HOST_WIDE_INT) loop->simdlen)) + { + if (first_loop_vinfo == NULL) + { + first_loop_vinfo = loop_vinfo; + first_vector_size = current_vector_size; + loop->aux = NULL; + } + else + delete loop_vinfo; + } + else + { + delete first_loop_vinfo; + return loop_vinfo; + } } - - delete loop_vinfo; + else + delete loop_vinfo; if (next_size == 0) autodetected_vector_size = current_vector_size; @@ -2308,10 +2329,31 @@ vect_analyze_loop (struct loop *loop, lo && known_eq (vector_sizes[next_size], autodetected_vector_size)) next_size += 1; - if (fatal - || next_size == vector_sizes.length () + if (fatal) + { + gcc_checking_assert (first_loop_vinfo == NULL); + return opt_loop_vec_info::propagate_failure (res); + } + + if (next_size == vector_sizes.length () || known_eq (current_vector_size, 0U)) - return opt_loop_vec_info::propagate_failure (res); + { + if (first_loop_vinfo) + { + current_vector_size = first_vector_size; + loop->aux = (loop_vec_info) first_loop_vinfo; + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "***** Choosing vector size "); + dump_dec (MSG_NOTE, current_vector_size); + dump_printf (MSG_NOTE, "\n"); + } + return first_loop_vinfo; + } + else + return opt_loop_vec_info::propagate_failure (res); + } /* Try the next biggest vector size. 
*/ current_vector_size = vector_sizes[next_size++]; @@ -8670,7 +8712,8 @@ vect_transform_loop (loop_vec_info loop_ if (epilogue) { auto_vector_sizes vector_sizes; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); + targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, + loop->simdlen != 0); unsigned int next_size = 0; /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work --- gcc/tree-vect-slp.c.jj 2019-05-14 21:37:33.653388439 +0200 +++ gcc/tree-vect-slp.c 2019-05-16 18:59:12.825873858 +0200 @@ -2983,7 +2983,7 @@ vect_slp_bb (basic_block bb) /* Autodetect first vector size we try. */ current_vector_size = 0; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); + targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false); unsigned int next_size = 0; gsi = gsi_start_bb (bb); --- gcc/target.def.jj 2019-02-18 20:48:35.742681472 +0100 +++ gcc/target.def 2019-05-16 18:55:50.373200394 +0200 @@ -1899,12 +1899,14 @@ DEFHOOK the only one that is worth considering, this hook should add all suitable\n\ vector sizes to @var{sizes}, in order of decreasing preference. The first\n\ one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}.\n\ +If @var{all} is true, add suitable vector sizes even when they are generally\n\ +not expected to be worthwhile.\n\ \n\ The hook does not need to do anything if the vector returned by\n\ @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant\n\ for autovectorization. The default implementation does nothing.", void, - (vector_sizes *sizes), + (vector_sizes *sizes, bool all), default_autovectorize_vector_sizes) /* Function to get a target mode for a vector mask. */ --- gcc/doc/tm.texi.jj 2019-02-18 20:48:34.132707883 +0100 +++ gcc/doc/tm.texi 2019-05-16 19:08:05.975113214 +0200 @@ -6016,11 +6016,13 @@ against lower halves of vectors recursiv reached. The default is @var{mode} which means no splitting. 
@end deftypefn -@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}) +@deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}, bool @var{all}) If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not the only one that is worth considering, this hook should add all suitable vector sizes to @var{sizes}, in order of decreasing preference. The first one should be the size of @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE}. +If @var{all} is true, add suitable vector sizes even when they are generally +not expected to be worthwhile. The hook does not need to do anything if the vector returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is the only one relevant --- gcc/targhooks.c.jj 2019-04-17 21:21:40.918117115 +0200 +++ gcc/targhooks.c 2019-05-16 18:56:38.586408190 +0200 @@ -1316,7 +1316,7 @@ default_split_reduction (machine_mode mo is tried. */ void -default_autovectorize_vector_sizes (vector_sizes *) +default_autovectorize_vector_sizes (vector_sizes *, bool) { } --- gcc/targhooks.h.jj 2019-01-16 09:35:04.563323106 +0100 +++ gcc/targhooks.h 2019-05-16 18:56:27.002598531 +0200 @@ -110,7 +110,7 @@ default_builtin_support_vector_misalignm int, bool); extern machine_mode default_preferred_simd_mode (scalar_mode mode); extern machine_mode default_split_reduction (machine_mode); -extern void default_autovectorize_vector_sizes (vector_sizes *); +extern void default_autovectorize_vector_sizes (vector_sizes *, bool); extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64); extern bool default_empty_mask_is_expensive (unsigned); extern void *default_init_cost (struct loop *); --- gcc/omp-low.c.jj 2019-05-16 15:04:41.785179634 +0200 +++ gcc/omp-low.c 2019-05-16 18:58:07.253951283 +0200 @@ -3600,7 +3600,7 @@ omp_clause_aligned_alignment (tree claus unsigned int al = 1; opt_scalar_mode mode_iter; auto_vector_sizes sizes; - 
targetm.vectorize.autovectorize_vector_sizes (&sizes); + targetm.vectorize.autovectorize_vector_sizes (&sizes, true); poly_uint64 vs = 0; for (unsigned int i = 0; i < sizes.length (); ++i) vs = ordered_max (vs, sizes[i]); --- gcc/omp-general.c.jj 2019-02-22 15:22:20.880919652 +0100 +++ gcc/omp-general.c 2019-05-16 18:57:05.254969995 +0200 @@ -469,7 +469,7 @@ omp_max_vf (void) return 1; auto_vector_sizes sizes; - targetm.vectorize.autovectorize_vector_sizes (&sizes); + targetm.vectorize.autovectorize_vector_sizes (&sizes, true); if (!sizes.is_empty ()) { poly_uint64 vf = 0; --- gcc/optabs-query.c.jj 2019-02-11 11:38:08.177618415 +0100 +++ gcc/optabs-query.c 2019-05-16 18:58:48.830268128 +0200 @@ -593,7 +593,7 @@ can_vec_mask_load_store_p (machine_mode return true; auto_vector_sizes vector_sizes; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes); + targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, true); for (unsigned int i = 0; i < vector_sizes.length (); ++i) { poly_uint64 cur = vector_sizes[i]; --- gcc/config/aarch64/aarch64.c.jj 2019-05-11 11:32:58.229357774 +0200 +++ gcc/config/aarch64/aarch64.c 2019-05-16 19:04:18.269854907 +0200 @@ -14105,7 +14105,7 @@ aarch64_preferred_simd_mode (scalar_mode /* Return a list of possible vector sizes for the vectorizer to iterate over. */ static void -aarch64_autovectorize_vector_sizes (vector_sizes *sizes) +aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool) { if (TARGET_SVE) sizes->safe_push (BYTES_PER_SVE_VECTOR); --- gcc/config/arc/arc.c.jj 2019-04-24 17:44:44.280019376 +0200 +++ gcc/config/arc/arc.c 2019-05-16 19:04:31.934630363 +0200 @@ -480,7 +480,7 @@ arc_preferred_simd_mode (scalar_mode mod TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. 
*/ static void -arc_autovectorize_vector_sizes (vector_sizes *sizes) +arc_autovectorize_vector_sizes (vector_sizes *sizes, bool) { if (TARGET_PLUS_QMACW) { --- gcc/config/arm/arm.c.jj 2019-05-10 09:31:31.113119373 +0200 +++ gcc/config/arm/arm.c 2019-05-16 19:04:51.586307442 +0200 @@ -288,7 +288,7 @@ static bool arm_builtin_support_vector_m static void arm_conditional_register_usage (void); static enum flt_eval_method arm_excess_precision (enum excess_precision_type); static reg_class_t arm_preferred_rename_class (reg_class_t rclass); -static void arm_autovectorize_vector_sizes (vector_sizes *); +static void arm_autovectorize_vector_sizes (vector_sizes *, bool); static int arm_default_branch_cost (bool, bool); static int arm_cortex_a5_branch_cost (bool, bool); static int arm_cortex_m_branch_cost (bool, bool); @@ -28347,7 +28347,7 @@ arm_vector_alignment (const_tree type) } static void -arm_autovectorize_vector_sizes (vector_sizes *sizes) +arm_autovectorize_vector_sizes (vector_sizes *sizes, bool) { if (!TARGET_NEON_VECTORIZE_DOUBLE) { --- gcc/config/i386/i386.c.jj 2019-05-15 23:36:47.920060787 +0200 +++ gcc/config/i386/i386.c 2019-05-16 19:03:16.217874556 +0200 @@ -21328,7 +21328,7 @@ ix86_preferred_simd_mode (scalar_mode mo 256bit and 128bit vectors. */ static void -ix86_autovectorize_vector_sizes (vector_sizes *sizes) +ix86_autovectorize_vector_sizes (vector_sizes *sizes, bool all) { if (TARGET_AVX512F && !TARGET_PREFER_AVX256) { @@ -21336,11 +21336,22 @@ ix86_autovectorize_vector_sizes (vector_ sizes->safe_push (32); sizes->safe_push (16); } + else if (TARGET_AVX512F && all) + { + sizes->safe_push (32); + sizes->safe_push (16); + sizes->safe_push (64); + } else if (TARGET_AVX && !TARGET_PREFER_AVX128) { sizes->safe_push (32); sizes->safe_push (16); } + else if (TARGET_AVX && all) + { + sizes->safe_push (16); + sizes->safe_push (32); + } } /* Implemenation of targetm.vectorize.get_mask_mode. 
*/ --- gcc/config/mips/mips.c.jj 2019-05-14 21:37:20.166613524 +0200 +++ gcc/config/mips/mips.c 2019-05-16 19:05:29.124690606 +0200 @@ -13460,7 +13460,7 @@ mips_preferred_simd_mode (scalar_mode mo /* Implement TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES. */ static void -mips_autovectorize_vector_sizes (vector_sizes *sizes) +mips_autovectorize_vector_sizes (vector_sizes *sizes, bool) { if (ISA_HAS_MSA) sizes->safe_push (16); --- gcc/testsuite/gcc.target/i386/avx512f-simd-1.c.jj 2019-05-16 19:29:17.556218761 +0200 +++ gcc/testsuite/gcc.target/i386/avx512f-simd-1.c 2019-05-16 19:23:50.508592664 +0200 @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-fopenmp-simd -O2 -mavx512f -masm=att" } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%xmm" } } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%ymm" } } */ +/* { dg-final { scan-assembler "vpadd\[^\n\r]*%zmm" } } */ + +#define N 1024 +int a[N]; + +void +f1 (void) +{ + int i; + #pragma omp simd simdlen (4) + for (i = 0; i < N; ++i) + a[i] = a[i] + 1; +} + +void +f2 (void) +{ + int i; + #pragma omp simd simdlen (8) + for (i = 0; i < N; ++i) + a[i] = a[i] + 2; +} + +void +f3 (void) +{ + int i; + #pragma omp simd simdlen (16) + for (i = 0; i < N; ++i) + a[i] = a[i] + 3; +} Jakub