https://gcc.gnu.org/g:af792f0226e479b165a49de5e8f9e1d16a4b26c0

commit r15-2191-gaf792f0226e479b165a49de5e8f9e1d16a4b26c0
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Mon Jul 22 10:26:14 2024 +0100

    middle-end: Implement conditional store vectorizer pattern [PR115531]
    
    This adds a conditional store optimization for the vectorizer as a pattern.
    The vectorizer already supports modifying memory accesses because of the
    pattern-based gather/scatter recognition.

    Doing it in the vectorizer keeps the ability to vectorize such loops for
    architectures that don't have MASK_STORE support, whereas doing this in
    ifcvt commits us to MASK_STORE.
    
    Concretely for this loop:
    
    void foo1 (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
    {
      if (stride <= 1)
        return;
    
      for (int i = 0; i < n; i++)
        {
          int res = c[i];
          int t = b[i+stride];
          if (a[i] != 0)
            res = t;
          c[i] = res;
        }
    }
    
    today we generate:
    
    .L3:
            ld1b    z29.s, p7/z, [x0, x5]
            ld1w    z31.s, p7/z, [x2, x5, lsl 2]
            ld1w    z30.s, p7/z, [x1, x5, lsl 2]
            cmpne   p15.b, p6/z, z29.b, #0
            sel     z30.s, p15, z30.s, z31.s
            st1w    z30.s, p7, [x2, x5, lsl 2]
            add     x5, x5, x4
            whilelo p7.s, w5, w3
            b.any   .L3
    
    which in gimple is:
    
      vect_res_18.9_68 = .MASK_LOAD (vectp_c.7_65, 32B, loop_mask_67);
      vect_t_20.12_74 = .MASK_LOAD (vectp.10_72, 32B, loop_mask_67);
      vect__9.15_77 = .MASK_LOAD (vectp_a.13_75, 8B, loop_mask_67);
      mask__34.16_79 = vect__9.15_77 != { 0, ... };
      vect_res_11.17_80 = VEC_COND_EXPR <mask__34.16_79, vect_t_20.12_74, vect_res_18.9_68>;
      .MASK_STORE (vectp_c.18_81, 32B, loop_mask_67, vect_res_11.17_80);
    
    A MASK_STORE is already conditional, so there's no need to perform the load
    of the old values and the VEC_COND_EXPR.  This patch makes it so we generate:
    
      vect_res_18.9_68 = .MASK_LOAD (vectp_c.7_65, 32B, loop_mask_67);
      vect__9.15_77 = .MASK_LOAD (vectp_a.13_75, 8B, loop_mask_67);
      mask__34.16_79 = vect__9.15_77 != { 0, ... };
      .MASK_STORE (vectp_c.18_81, 32B, mask__34.16_79, vect_res_18.9_68);
    
    which generates:
    
    .L3:
            ld1b    z30.s, p7/z, [x0, x5]
            ld1w    z31.s, p7/z, [x1, x5, lsl 2]
            cmpne   p7.b, p7/z, z30.b, #0
            st1w    z31.s, p7, [x2, x5, lsl 2]
            add     x5, x5, x4
            whilelo p7.s, w5, w3
            b.any   .L3
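
    Whether the new pattern fires is gated by the target hook added below: the
    default implementation treats IFN_MASK_STORE as expensive, so targets have
    to opt in.  A minimal sketch of such an opt-in, assuming a hypothetical
    target with cheap masked stores (illustrative only, not part of this patch):

    /* Hypothetical target code: enable the cond_store pattern by reporting
       that masked operations are no more expensive than their unmasked
       equivalents.  */
    static bool
    example_conditional_operation_is_expensive (unsigned ifn ATTRIBUTE_UNUSED)
    {
      return false;
    }

    #undef TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
    #define TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE \
      example_conditional_operation_is_expensive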
    
    gcc/ChangeLog:
    
            PR tree-optimization/115531
            * tree-vect-patterns.cc (vect_cond_store_pattern_same_ref): New.
            (vect_recog_cond_store_pattern): New.
            (vect_vect_recog_func_ptrs): Use it.
            * target.def (conditional_operation_is_expensive): New.
            * doc/tm.texi: Regenerate.
            * doc/tm.texi.in: Document it.
            * targhooks.cc (default_conditional_operation_is_expensive): New.
            * targhooks.h (default_conditional_operation_is_expensive): New.

Diff:
---
 gcc/doc/tm.texi           |   7 ++
 gcc/doc/tm.texi.in        |   2 +
 gcc/target.def            |  12 ++++
 gcc/targhooks.cc          |   8 +++
 gcc/targhooks.h           |   1 +
 gcc/tree-vect-patterns.cc | 159 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 189 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index f10d9a59c667..c7535d07f4dd 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6449,6 +6449,13 @@ The default implementation returns a @code{MODE_VECTOR_INT} with the
 same size and number of elements as @var{mode}, if such a mode exists.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE (unsigned @var{ifn})
+This hook returns true if masked operation @var{ifn} (really of
+type @code{internal_fn}) should be considered more expensive to use than
+implementing the same operation without masking.  GCC can then try to use
+unconditional operations instead with extra selects.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE (unsigned @var{ifn})
 This hook returns true if masked internal function @var{ifn} (really of
 type @code{internal_fn}) should be considered expensive when the mask is
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 24596eb2f6b4..64cea3b1edaf 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4290,6 +4290,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_GET_MASK_MODE
 
+@hook TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE
+
 @hook TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
 
 @hook TARGET_VECTORIZE_CREATE_COSTS
diff --git a/gcc/target.def b/gcc/target.def
index ce4d1ecd58be..3de1aad4c84d 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2033,6 +2033,18 @@ same size and number of elements as @var{mode}, if such a mode exists.",
  (machine_mode mode),
  default_get_mask_mode)
 
+/* Function to say whether a conditional operation is expensive when
+   compared to non-masked operations.  */
+DEFHOOK
+(conditional_operation_is_expensive,
+ "This hook returns true if masked operation @var{ifn} (really of\n\
+type @code{internal_fn}) should be considered more expensive to use than\n\
+implementing the same operation without masking.  GCC can then try to use\n\
+unconditional operations instead with extra selects.",
+ bool,
+ (unsigned ifn),
+ default_conditional_operation_is_expensive)
+
 /* Function to say whether a masked operation is expensive when the
    mask is all zeros.  */
 DEFHOOK
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b10104c363bf..793932a77c60 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1608,6 +1608,14 @@ default_get_mask_mode (machine_mode mode)
 
 /* By default consider masked stores to be expensive.  */
 
+bool
+default_conditional_operation_is_expensive (unsigned ifn)
+{
+  return ifn == IFN_MASK_STORE;
+}
+
+/* By default consider masked stores to be expensive.  */
+
 bool
 default_empty_mask_is_expensive (unsigned ifn)
 {
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index 3cbca0f13a5e..2704d6008f14 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -123,6 +123,7 @@ extern opt_machine_mode default_vectorize_related_mode (machine_mode,
                                                        poly_uint64);
 extern opt_machine_mode default_get_mask_mode (machine_mode);
 extern bool default_empty_mask_is_expensive (unsigned);
+extern bool default_conditional_operation_is_expensive (unsigned);
 extern vector_costs *default_vectorize_create_costs (vec_info *, bool);
 
 /* OpenACC hooks.  */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 4570c25b6647..b0821c74c1d8 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -51,8 +51,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "omp-simd-clone.h"
 #include "predict.h"
 #include "tree-vector-builder.h"
+#include "tree-ssa-loop-ivopts.h"
 #include "vec-perm-indices.h"
 #include "gimple-range.h"
+#include "alias.h"
 
 
 /* TODO:  Note the vectorizer still builds COND_EXPRs with GENERIC compares
@@ -6526,6 +6528,162 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
+/* Helper method of vect_recog_cond_store_pattern, checks to see if COND_ARG
+   points to a load statement that reads the same data as that of
+   STORE_VINFO.  */
+
+static bool
+vect_cond_store_pattern_same_ref (vec_info *vinfo,
+                                 stmt_vec_info store_vinfo, tree cond_arg)
+{
+  stmt_vec_info load_stmt_vinfo = vinfo->lookup_def (cond_arg);
+  if (!load_stmt_vinfo
+      || !STMT_VINFO_DATA_REF (load_stmt_vinfo)
+      || DR_IS_WRITE (STMT_VINFO_DATA_REF (load_stmt_vinfo))
+      || !same_data_refs (STMT_VINFO_DATA_REF (store_vinfo),
+                         STMT_VINFO_DATA_REF (load_stmt_vinfo)))
+    return false;
+
+  return true;
+}
+
+/* Function vect_recog_cond_store_pattern
+
+   Try to find the following pattern:
+
+   x = *_3;
+   c = a CMP b;
+   y = c ? t_20 : x;
+   *_3 = y;
+
+   where the store of _3 happens on a conditional select on a value loaded
+   from the same location.  In such a case we can elide the initial load if
+   MASK_STORE is supported and instead only conditionally write out the result.
+
+   The pattern produces for the above:
+
+   c = a CMP b;
+   .MASK_STORE (_3, c, t_20)
+
+   Input:
+
+   * STMT_VINFO: The stmt from which the pattern search begins.  In the
+   example above, the search begins at the store to *_3.
+
+   Output:
+
+   * TYPE_OUT: The type of the output of this pattern.
+
+   * Return value: A new stmt that will be used to replace the sequence.  */
+
+static gimple *
+vect_recog_cond_store_pattern (vec_info *vinfo,
+                              stmt_vec_info stmt_vinfo, tree *type_out)
+{
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!loop_vinfo)
+    return NULL;
+
+  gimple *store_stmt = STMT_VINFO_STMT (stmt_vinfo);
+
+  /* Needs to be a gimple store for which we have DR info.  */
+  if (!STMT_VINFO_DATA_REF (stmt_vinfo)
+      || DR_IS_READ (STMT_VINFO_DATA_REF (stmt_vinfo))
+      || !gimple_store_p (store_stmt))
+    return NULL;
+
+  tree st_rhs = gimple_assign_rhs1 (store_stmt);
+
+  if (TREE_CODE (st_rhs) != SSA_NAME)
+    return NULL;
+
+  gassign *cond_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (st_rhs));
+  if (!cond_stmt || gimple_assign_rhs_code (cond_stmt) != COND_EXPR)
+    return NULL;
+
+  /* Check if the else value matches the original loaded one.  */
+  bool invert = false;
+  tree cmp_ls = gimple_arg (cond_stmt, 0);
+  tree cond_arg1 = gimple_arg (cond_stmt, 1);
+  tree cond_arg2 = gimple_arg (cond_stmt, 2);
+
+  if (!vect_cond_store_pattern_same_ref (vinfo, stmt_vinfo, cond_arg2)
+      && !(invert = vect_cond_store_pattern_same_ref (vinfo, stmt_vinfo,
+                                                     cond_arg1)))
+    return NULL;
+
+  vect_pattern_detected ("vect_recog_cond_store_pattern", store_stmt);
+
+  tree scalar_type = TREE_TYPE (st_rhs);
+  if (VECTOR_TYPE_P (scalar_type))
+    return NULL;
+
+  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
+  if (vectype == NULL_TREE)
+    return NULL;
+
+  machine_mode mask_mode;
+  machine_mode vecmode = TYPE_MODE (vectype);
+  if (targetm.vectorize.conditional_operation_is_expensive (IFN_MASK_STORE)
+      || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode)
+      || !can_vec_mask_load_store_p (vecmode, mask_mode, false))
+    return NULL;
+
+  tree base = DR_REF (STMT_VINFO_DATA_REF (stmt_vinfo));
+  if (may_be_nonaddressable_p (base))
+    return NULL;
+
+  /* We need to use the false parameter of the conditional select.  */
+  tree cond_store_arg = invert ? cond_arg2 : cond_arg1;
+  tree cond_load_arg = invert ? cond_arg1 : cond_arg2;
+  gimple *load_stmt = SSA_NAME_DEF_STMT (cond_load_arg);
+
+  /* This is a rough estimation to check that there aren't any aliasing stores
+     in between the load and store.  It's a bit strict, but for now it's good
+     enough.  */
+  if (gimple_vuse (load_stmt) != gimple_vuse (store_stmt))
+    return NULL;
+
+  /* If we have to invert the condition, i.e. use the true argument rather than
+     the false argument, we have to negate the mask.  */
+  if (invert)
+    {
+      tree var = vect_recog_temp_ssa_var (boolean_type_node, NULL);
+
+      /* Invert the mask using ^ 1.  */
+      tree itype = TREE_TYPE (cmp_ls);
+      gassign *conv = gimple_build_assign (var, BIT_XOR_EXPR, cmp_ls,
+                                          build_int_cst (itype, 1));
+
+      tree mask_vec_type = get_mask_type_for_scalar_type (vinfo, itype);
+      append_pattern_def_seq (vinfo, stmt_vinfo, conv, mask_vec_type, itype);
+      cmp_ls = var;
+    }
+
+  if (TREE_CODE (base) != MEM_REF)
+    base = build_fold_addr_expr (base);
+
+  tree ptr = build_int_cst (reference_alias_ptr_type (base),
+                           get_object_alignment (base));
+
+  /* Convert the mask to the right form.  */
+  tree mask = vect_convert_mask_for_vectype (cmp_ls, vectype, stmt_vinfo,
+                                            vinfo);
+
+  gcall *call
+    = gimple_build_call_internal (IFN_MASK_STORE, 4, base, ptr, mask,
+                                 cond_store_arg);
+  gimple_set_location (call, gimple_location (store_stmt));
+
+  /* Copy across relevant vectorization info and associate DR with the
+     new pattern statement instead of the original statement.  */
+  stmt_vec_info pattern_stmt_info = loop_vinfo->add_stmt (call);
+  loop_vinfo->move_dr (pattern_stmt_info, stmt_vinfo);
+
+  *type_out = vectype;
+  return call;
+}
+
 /* Return true if TYPE is a non-boolean integer type.  These are the types
    that we want to consider for narrowing.  */
 
@@ -7191,6 +7349,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
      of mask conversion that are needed for gather and scatter
      internal functions.  */
   { vect_recog_gather_scatter_pattern, "gather_scatter" },
+  { vect_recog_cond_store_pattern, "cond_store" },
   { vect_recog_mask_conversion_pattern, "mask_conversion" },
   { vect_recog_widen_plus_pattern, "widen_plus" },
   { vect_recog_widen_minus_pattern, "widen_minus" },
