Hi,

Following Richard's suggestion in the thread discussion[1], this patch
factors out the nloads computation for strided accesses in
vectorizable_load, so that cost estimation obtains information
consistent with what the transform phase will actually use.

By the way, I didn't save this information into stmt_info during the
analysis phase and fetch it in the transform phase because it is only
needed for strided SLP loads, and recomputing it looks cheap enough to
be acceptable.
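
To illustrate, here is a simplified excerpt of how both call sites now
share the helper (trimmed from the patch below, not the exact final
code):

  strided_load_info info
    = vect_get_strided_load_info (memory_access_type, vectype, group_size);
  /* Costing now matches the transform phase: a vec_construct is only
     accounted when the vector is really built from several loads.  */
  if (info.nloads > 1)
    inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
                                     stmt_info, 0, vect_body);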

Bootstrapped/regtested on powerpc64le-linux-gnu P9,
x86_64-redhat-linux and aarch64-linux-gnu.

Is it OK for trunk, or should it wait for the next stage 1?

BR,
Kewen

[1] https://gcc.gnu.org/legacy-ml/gcc-patches/2017-09/msg01433.html

gcc/ChangeLog:

        PR tree-optimization/82255
        * tree-vect-stmts.c (vector_vector_composition_type): Move before
        vect_model_store_cost.
        (struct strided_load_info): New structure.
        (vect_get_strided_load_info): New function factored out from...
        (vectorizable_load): ...this.  Call vect_get_strided_load_info
        accordingly.
        (vect_model_load_cost): Call vect_get_strided_load_info and only
        cost vec_construct when more than one load is required.

gcc/testsuite/ChangeLog:

2020-01-15  Bill Schmidt  <wschm...@linux.ibm.com>
            Kewen Lin  <li...@gcc.gnu.org>

        PR tree-optimization/82255
        * gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c
new file mode 100644
index 00000000000..aaeefc39595
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-pr82255.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
+
+/* PR82255: Ensure we don't account a vec_construct cost when the strided
+   load doesn't require any vector construction.  */
+
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__))
+__attribute__ ((__const__));
+
+static int
+foo (unsigned char *w, int i, unsigned char *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)
+    {
+#pragma GCC unroll 16
+      for (int b = 0; b < 16; b++)
+       tot += abs (w[b] - x[b]);
+      w += i;
+      x += j;
+    }
+  return tot;
+}
+
+void
+bar (unsigned char *w, unsigned char *x, int i, int *result)
+{
+  *result = foo (w, 16, x, i);
+}
+
+/* { dg-final { scan-tree-dump-times "vec_construct" 0 "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 068e4982303..d1cbc55a676 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -897,6 +897,146 @@ cfun_returns (tree decl)
   return false;
 }
 
+/* Function VECTOR_VECTOR_COMPOSITION_TYPE
+
+   This function returns a vector type which can be composed from NELTS
+   pieces, whose type is recorded in PTYPE.  VTYPE should be a vector type
+   with the same vector size as the returned vector.  It first checks
+   whether the target supports a vector mode of the piece size for the
+   construction; if not, it then checks a scalar mode of the piece size.
+   It returns NULL_TREE if no available composition can be found.
+
+   For example, for (vtype=V16QI, nelts=4), we can probably get:
+     - V16QI with PTYPE V4QI.
+     - V4SI with PTYPE SI.
+     - NULL_TREE.  */
+
+static tree
+vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+{
+  gcc_assert (VECTOR_TYPE_P (vtype));
+  gcc_assert (known_gt (nelts, 0U));
+
+  machine_mode vmode = TYPE_MODE (vtype);
+  if (!VECTOR_MODE_P (vmode))
+    return NULL_TREE;
+
+  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+  unsigned int pbsize;
+  if (constant_multiple_p (vbsize, nelts, &pbsize))
+    {
+      /* First check if vec_init optab supports construction from
+        vector pieces directly.  */
+      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
+      poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
+      machine_mode rmode;
+      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
+         && (convert_optab_handler (vec_init_optab, vmode, rmode)
+             != CODE_FOR_nothing))
+       {
+         *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
+         return vtype;
+       }
+
+      /* Otherwise check whether an integer type of the same piece size exists
+	 and whether the vec_init optab supports construction from it.  */
+      if (int_mode_for_size (pbsize, 0).exists (&elmode)
+         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
+         && (convert_optab_handler (vec_init_optab, rmode, elmode)
+             != CODE_FOR_nothing))
+       {
+         *ptype = build_nonstandard_integer_type (pbsize, 1);
+         return build_vector_type (*ptype, nelts);
+       }
+    }
+
+  return NULL_TREE;
+}
+
+/* Hold information on how VMAT_ELEMENTWISE or VMAT_STRIDED_SLP strided
+   loads are performed in function vectorizable_load.  */
+struct strided_load_info {
+  /* Number of loads required.  */
+  int nloads;
+  /* Number of group elements advanced by each load.  */
+  int lnel;
+  /* Type used for each individual load.  */
+  tree ltype;
+  /* Vector type used for possible vector construction.  */
+  tree lvectype;
+};
+
+/* Determine how to perform VMAT_ELEMENTWISE or VMAT_STRIDED_SLP loads,
+   considering the number of vector units, the group size and the target's
+   support for vector construction, and return the strided_load_info.
+   ACCESS_TYPE indicates the memory access type, VECTYPE the vector type,
+   and GROUP_SIZE the group size of a grouped access.  */
+
+static strided_load_info
+vect_get_strided_load_info (vect_memory_access_type access_type, tree vectype,
+                           int group_size)
+{
+  int nloads, lnel;
+  tree ltype, lvectype;
+
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+  /* Checked by get_load_store_type.  */
+  unsigned int const_nunits = nunits.to_constant ();
+
+  if (access_type == VMAT_STRIDED_SLP)
+    {
+      if ((unsigned int) group_size < const_nunits)
+       {
+         /* First check if vec_init optab supports construction from vector
+            elts directly.  Otherwise avoid emitting a constructor of
+            vector elements by performing the loads using an integer type
+            of the same size, constructing a vector of those and then
+            re-interpreting it as the original vector type.  This avoids a
+            huge runtime penalty due to the general inability to perform
+            store forwarding from smaller stores to a larger load.  */
+         tree ptype;
+         unsigned int nelts = const_nunits / group_size;
+         tree vtype = vector_vector_composition_type (vectype, nelts, &ptype);
+         if (vtype != NULL_TREE)
+           {
+             nloads = nelts;
+             lnel = group_size;
+             ltype = ptype;
+             lvectype = vtype;
+           }
+         else
+           {
+             nloads = const_nunits;
+             lnel = 1;
+             ltype = TREE_TYPE (vectype);
+             lvectype = vectype;
+           }
+       }
+      else
+       {
+         nloads = 1;
+         lnel = const_nunits;
+         ltype = vectype;
+         lvectype = vectype;
+       }
+      ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+    }
+  else
+    {
+      gcc_assert (access_type == VMAT_ELEMENTWISE);
+      nloads = const_nunits;
+      lnel = 1;
+      /* With just one unit, load the vector(1) vectype directly.  */
+      if (nloads == 1)
+       ltype = vectype;
+      else
+       ltype = TREE_TYPE (vectype);
+      lvectype = vectype;
+    }
+
+  return {nloads, lnel, ltype, lvectype};
+}
+
 /* Function vect_model_store_cost
 
    Models cost for stores.  In the case of grouped accesses, one access
@@ -1157,8 +1297,22 @@ vect_model_load_cost (vec_info *vinfo,
                        cost_vec, cost_vec, true);
   if (memory_access_type == VMAT_ELEMENTWISE
       || memory_access_type == VMAT_STRIDED_SLP)
-    inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
-                                    stmt_info, 0, vect_body);
+    {
+      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+      int group_size = 1;
+      if (slp_node && grouped_access_p)
+       {
+         first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         group_size = DR_GROUP_SIZE (first_stmt_info);
+       }
+      strided_load_info
+       info = vect_get_strided_load_info (memory_access_type,
+                                          vectype, group_size);
+      /* A vec_construct is only required when more than one load is needed.  */
+      if (info.nloads > 1)
+       inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
+                                        stmt_info, 0, vect_body);
+    }
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -1996,62 +2150,6 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
   gcc_unreachable ();
 }
 
-/* Function VECTOR_VECTOR_COMPOSITION_TYPE
-
-   This function returns a vector type which can be composed with NETLS pieces,
-   whose type is recorded in PTYPE.  VTYPE should be a vector type, and has the
-   same vector size as the return vector.  It checks target whether supports
-   pieces-size vector mode for construction firstly, if target fails to, check
-   pieces-size scalar mode for construction further.  It returns NULL_TREE if
-   fails to find the available composition.
-
-   For example, for (vtype=V16QI, nelts=4), we can probably get:
-     - V16QI with PTYPE V4QI.
-     - V4SI with PTYPE SI.
-     - NULL_TREE.  */
-
-static tree
-vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
-{
-  gcc_assert (VECTOR_TYPE_P (vtype));
-  gcc_assert (known_gt (nelts, 0U));
-
-  machine_mode vmode = TYPE_MODE (vtype);
-  if (!VECTOR_MODE_P (vmode))
-    return NULL_TREE;
-
-  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
-  unsigned int pbsize;
-  if (constant_multiple_p (vbsize, nelts, &pbsize))
-    {
-      /* First check if vec_init optab supports construction from
-        vector pieces directly.  */
-      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
-      poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
-      machine_mode rmode;
-      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
-         && (convert_optab_handler (vec_init_optab, vmode, rmode)
-             != CODE_FOR_nothing))
-       {
-         *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
-         return vtype;
-       }
-
-      /* Otherwise check if exists an integer type of the same piece size and
-        if vec_init optab supports construction from it directly.  */
-      if (int_mode_for_size (pbsize, 0).exists (&elmode)
-         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
-         && (convert_optab_handler (vec_init_optab, rmode, elmode)
-             != CODE_FOR_nothing))
-       {
-         *ptype = build_nonstandard_integer_type (pbsize, 1);
-         return build_vector_type (*ptype, nelts);
-       }
-    }
-
-  return NULL_TREE;
-}
-
 /* A subroutine of get_load_store_type, with a subset of the same
    arguments.  Handle the case where STMT_INFO is part of a grouped load
    or store.
@@ -8745,49 +8843,7 @@ vectorizable_load (vec_info *vinfo,
 
       stride_step = cse_and_gimplify_to_preheader (loop_vinfo, stride_step);
 
-      running_off = offvar;
-      alias_off = build_int_cst (ref_type, 0);
-      int nloads = const_nunits;
-      int lnel = 1;
-      tree ltype = TREE_TYPE (vectype);
-      tree lvectype = vectype;
       auto_vec<tree> dr_chain;
-      if (memory_access_type == VMAT_STRIDED_SLP)
-       {
-         if (group_size < const_nunits)
-           {
-             /* First check if vec_init optab supports construction from vector
-                elts directly.  Otherwise avoid emitting a constructor of
-                vector elements by performing the loads using an integer type
-                of the same size, constructing a vector of those and then
-                re-interpreting it as the original vector type.  This avoids a
-                huge runtime penalty due to the general inability to perform
-                store forwarding from smaller stores to a larger load.  */
-             tree ptype;
-             tree vtype
-               = vector_vector_composition_type (vectype,
-                                                 const_nunits / group_size,
-                                                 &ptype);
-             if (vtype != NULL_TREE)
-               {
-                 nloads = const_nunits / group_size;
-                 lnel = group_size;
-                 lvectype = vtype;
-                 ltype = ptype;
-               }
-           }
-         else
-           {
-             nloads = 1;
-             lnel = const_nunits;
-             ltype = vectype;
-           }
-         ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
-       }
-      /* Load vector(1) scalar_type if it's 1 element-wise vectype.  */
-      else if (nloads == 1)
-       ltype = vectype;
-
       if (slp)
        {
          /* For SLP permutation support we need to load the whole group,
@@ -8804,6 +8860,13 @@ vectorizable_load (vec_info *vinfo,
          else
            ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
        }
+
+      running_off = offvar;
+      alias_off = build_int_cst (ref_type, 0);
+      strided_load_info
+	sload_info = vect_get_strided_load_info (memory_access_type,
+						 vectype, group_size);
+      int nloads = sload_info.nloads;
+      tree ltype = sload_info.ltype;
       unsigned int group_el = 0;
       unsigned HOST_WIDE_INT
        elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
@@ -8824,7 +8887,7 @@ vectorizable_load (vec_info *vinfo,
                CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
                                        gimple_assign_lhs (new_stmt));
 
-             group_el += lnel;
+             group_el += sload_info.lnel;
              if (! slp
                  || group_el == group_size)
                {
@@ -8839,11 +8902,11 @@ vectorizable_load (vec_info *vinfo,
            }
          if (nloads > 1)
            {
-             tree vec_inv = build_constructor (lvectype, v);
-             new_temp = vect_init_vector (vinfo, stmt_info,
-                                          vec_inv, lvectype, gsi);
+             tree vec_inv = build_constructor (sload_info.lvectype, v);
+             new_temp = vect_init_vector (vinfo, stmt_info, vec_inv,
+                                          sload_info.lvectype, gsi);
              new_stmt = SSA_NAME_DEF_STMT (new_temp);
-             if (lvectype != vectype)
+             if (sload_info.lvectype != vectype)
                {
                  new_stmt = gimple_build_assign (make_ssa_name (vectype),
                                                  VIEW_CONVERT_EXPR,
