https://gcc.gnu.org/g:9dff0355c75d70d2a7583b626426599ee054406a

commit r16-5491-g9dff0355c75d70d2a7583b626426599ee054406a
Author: Richard Biener <[email protected]>
Date:   Fri Nov 21 12:14:46 2025 +0100

    Fix OMP SIMD clone mask register and query
    
    The following removes the confusion around num_mask_args that was
    added to properly "guess" the number of mask elements in an AVX512
    mask that's just represented as int.  The actual mistake lies in
    the mixup of 'ncopies', which is used to track the number of
    OMP SIMD calls to be emitted rather than the number of input
    vectors.  So this reverts the earlier r16-5374-g5c2fdfc24e343c,
    uses the proper 'ncopies' for loop mask record/query and adjusts
    the guessing of the SIMD arg mask elements.
    
            PR tree-optimization/122762
            PR tree-optimization/122736
            PR tree-optimization/122790
            * cgraph.h (cgraph_simd_clone_arg::linear_step): Document
            use for SIMD_CLONE_ARG_TYPE_MASK.
            * omp-simd-clone.cc (simd_clone_adjust_argument_types):
            Record the number of mask arguments in linear_step if
            mask_mode is not VOIDmode.
            * tree-vect-stmts.cc (vectorizable_simd_clone_call):
            Remove num_mask_args computation, use a proper ncopies
            to query/register loop masks, use linear_step for the
            number of mask arguments when determining the number of
            mask elements in a mask argument.
    
            * gcc.dg/vect/vect-simd-clone-23.c: New testcase.

Diff:
---
 gcc/cgraph.h                                   |  4 ++-
 gcc/omp-simd-clone.cc                          |  4 +++
 gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c | 17 ++++++++++
 gcc/tree-vect-stmts.cc                         | 46 ++++++++++----------------
 4 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index aa2207b1dd20..313610fbe2c6 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -805,7 +805,9 @@ struct GTY(()) cgraph_simd_clone_arg {
   /* For arg_type SIMD_CLONE_ARG_TYPE_LINEAR_*CONSTANT_STEP this is
      the constant linear step, if arg_type is
      SIMD_CLONE_ARG_TYPE_LINEAR_*VARIABLE_STEP, this is index of
-     the uniform argument holding the step, otherwise 0.  */
+     the uniform argument holding the step, otherwise 0.
+     For arg_type SIMD_CLONE_ARG_TYPE_MASK and a mask_mode that is
+     not VOIDmode, this is the number of mask arguments.  */
   HOST_WIDE_INT linear_step;
 };
 
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index fe6093e45543..b71d5177a509 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -892,6 +892,10 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
       sc->args[i].orig_type = base_type;
       sc->args[i].arg_type = SIMD_CLONE_ARG_TYPE_MASK;
       sc->args[i].vector_type = mask_type;
+      /* Record the number of mask copies when that is difficult to
+        compute.  */
+      if (sc->mask_mode != VOIDmode)
+       sc->args[i].linear_step = k;
     }
 
   if (!node->definition)
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c
new file mode 100644
index 000000000000..312ac9f468fa
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-23.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd" } */
+/* { dg-additional-options "-mavx512bw" { target avx512bw } } */
+
+#pragma omp declare simd simdlen(32) inbranch
+int __attribute__((const)) baz (int x);
+
+short a[1024];
+
+void __attribute__((noipa))
+foo (int n, int * __restrict b)
+{
+  for (int i = 0; i < n; ++i)
+    if (a[i])
+      b[i] = baz (b[i]);
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b8e36d4ee090..4a397cc1142e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4055,7 +4055,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
   tree fndecl, new_temp;
-  int ncopies, j;
+  int j;
   auto_vec<simd_call_arg_info> arginfo;
   vec<tree> vargs = vNULL;
   size_t i, nargs;
@@ -4211,6 +4211,8 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
          && TREE_CODE (op) == SSA_NAME)
        vect_simd_lane_linear (op, loop, &thisarginfo);
 
+      if (!vectype)
+       vectype = thisarginfo.vectype;
       arginfo.quick_push (thisarginfo);
     }
 
@@ -4339,13 +4341,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
   if (bestn == NULL)
     return false;
 
-  unsigned int num_mask_args = 0;
-  for (i = 0; i < bestn->simdclone->nargs; i++)
-    if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
-      num_mask_args++;
-  if (!SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
-    gcc_assert (num_mask_args <= 1);
-
   for (i = 0; i < nargs; i++)
     {
       if ((arginfo[i].dt == vect_constant_def
@@ -4403,7 +4398,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
            {
              if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
                  || maybe_ne (exact_div (bestn->simdclone->simdlen,
-                                         num_mask_args),
+                                         bestn->simdclone->args[i].linear_step),
                               TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
                {
                  /* FORNOW we only have partial support for integer-type masks
@@ -4431,7 +4426,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
   fndecl = bestn->decl;
   nunits = bestn->simdclone->simdlen;
-  ncopies = vector_unroll_factor (vf * group_size, nunits);
+  int ncopies = vector_unroll_factor (vf * group_size, nunits);
 
   /* If the function isn't const, only allow it in simd loops where user
      has asserted that at least nunits consecutive iterations can be
@@ -4440,6 +4435,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
       && gimple_vuse (stmt))
     return false;
 
+  /* ncopies is the number of SIMD clone calls we create, since simdlen
+     is not necessarily matching nunits of the vector types used, track
+     that in ncopies_in.  */
+  int ncopies_in = vect_get_num_vectors (vf * group_size, vectype);
+
   /* Sanity check: make sure that at least one copy of the vectorized stmt
      needs to be generated.  */
   gcc_assert (ncopies >= 1);
@@ -4491,20 +4491,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
            case SIMD_CLONE_ARG_TYPE_MASK:
              if (loop_vinfo
                  && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-               {
-                 tree arg_vectype;
-                 if (SCALAR_INT_MODE_P
-                       (TYPE_MODE (bestn->simdclone->args[i].vector_type)))
-                   arg_vectype = build_truth_vector_type_for_mode
-                       (exact_div (bestn->simdclone->simdlen, num_mask_args),
-                        TYPE_MODE (bestn->simdclone->args[i].vector_type));
-                 else
-                   arg_vectype = bestn->simdclone->args[i].vector_type;
-                 vect_record_loop_mask (loop_vinfo,
-                                        &LOOP_VINFO_MASKS (loop_vinfo),
-                                        ncopies * num_mask_args, arg_vectype,
-                                        op);
-               }
+               vect_record_loop_mask (loop_vinfo,
+                                      &LOOP_VINFO_MASKS (loop_vinfo),
+                                      ncopies_in, vectype, op);
              break;
            }
        }
@@ -4694,7 +4683,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                                = &LOOP_VINFO_MASKS (loop_vinfo);
                              tree loop_mask
                                = vect_get_loop_mask (loop_vinfo, gsi,
-                                                     loop_masks, ncopies,
+                                                     loop_masks, ncopies_in,
                                                      vectype, j);
                              vec_oprnd0
                                = prepare_vec_mask (loop_vinfo,
@@ -4728,11 +4717,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
              else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode))
                {
                  atype = bestn->simdclone->args[i].vector_type;
-                 /* Guess the number of lanes represented by atype.  */
                  poly_uint64 atype_subparts
                    = exact_div (bestn->simdclone->simdlen,
-                                num_mask_args);
-                 o = vector_unroll_factor (nunits, atype_subparts);
+                                bestn->simdclone->args[i].linear_step);
+                 o = bestn->simdclone->args[i].linear_step;
                  for (m = j * o; m < (j + 1) * o; m++)
                    {
                      if (m == 0)
@@ -4756,7 +4744,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
                                = &LOOP_VINFO_MASKS (loop_vinfo);
                              tree loop_mask
                                = vect_get_loop_mask (loop_vinfo, gsi,
-                                                     loop_masks, ncopies,
+                                                     loop_masks, ncopies_in,
                                                      vectype, j);
                              vec_oprnd0
                                = prepare_vec_mask (loop_vinfo,

Reply via email to