https://gcc.gnu.org/g:af792f0226e479b165a49de5e8f9e1d16a4b26c0
commit r15-2191-gaf792f0226e479b165a49de5e8f9e1d16a4b26c0 Author: Tamar Christina <tamar.christ...@arm.com> Date: Mon Jul 22 10:26:14 2024 +0100 middle-end: Implement conditional store vectorizer pattern [PR115531] This adds a conditional store optimization for the vectorizer as a pattern. The vectorizer already supports modifying memory accesses because of the pattern based gather/scatter recognition. Doing it in the vectorizer allows us to still keep the ability to vectorize such loops for architectures that don't have MASK_STORE support, whereas doing this in ifcvt makes us commit to MASK_STORE. Concretely for this loop: void foo1 (char *restrict a, int *restrict b, int *restrict c, int n, int stride) { if (stride <= 1) return; for (int i = 0; i < n; i++) { int res = c[i]; int t = b[i+stride]; if (a[i] != 0) res = t; c[i] = res; } } today we generate: .L3: ld1b z29.s, p7/z, [x0, x5] ld1w z31.s, p7/z, [x2, x5, lsl 2] ld1w z30.s, p7/z, [x1, x5, lsl 2] cmpne p15.b, p6/z, z29.b, #0 sel z30.s, p15, z30.s, z31.s st1w z30.s, p7, [x2, x5, lsl 2] add x5, x5, x4 whilelo p7.s, w5, w3 b.any .L3 which in gimple is: vect_res_18.9_68 = .MASK_LOAD (vectp_c.7_65, 32B, loop_mask_67); vect_t_20.12_74 = .MASK_LOAD (vectp.10_72, 32B, loop_mask_67); vect__9.15_77 = .MASK_LOAD (vectp_a.13_75, 8B, loop_mask_67); mask__34.16_79 = vect__9.15_77 != { 0, ... }; vect_res_11.17_80 = VEC_COND_EXPR <mask__34.16_79, vect_t_20.12_74, vect_res_18.9_68>; .MASK_STORE (vectp_c.18_81, 32B, loop_mask_67, vect_res_11.17_80); A MASK_STORE is already conditional, so there's no need to perform the load of the old values and the VEC_COND_EXPR. This patch makes it so we generate: vect_res_18.9_68 = .MASK_LOAD (vectp_c.7_65, 32B, loop_mask_67); vect__9.15_77 = .MASK_LOAD (vectp_a.13_75, 8B, loop_mask_67); mask__34.16_79 = vect__9.15_77 != { 0, ... 
}; .MASK_STORE (vectp_c.18_81, 32B, mask__34.16_79, vect_res_18.9_68); which generates: .L3: ld1b z30.s, p7/z, [x0, x5] ld1w z31.s, p7/z, [x1, x5, lsl 2] cmpne p7.b, p7/z, z30.b, #0 st1w z31.s, p7, [x2, x5, lsl 2] add x5, x5, x4 whilelo p7.s, w5, w3 b.any .L3 gcc/ChangeLog: PR tree-optimization/115531 * tree-vect-patterns.cc (vect_cond_store_pattern_same_ref): New. (vect_recog_cond_store_pattern): New. (vect_vect_recog_func_ptrs): Use it. * target.def (conditional_operation_is_expensive): New. * doc/tm.texi: Regenerate. * doc/tm.texi.in: Document it. * targhooks.cc (default_conditional_operation_is_expensive): New. * targhooks.h (default_conditional_operation_is_expensive): New. Diff: --- gcc/doc/tm.texi | 7 ++ gcc/doc/tm.texi.in | 2 + gcc/target.def | 12 ++++ gcc/targhooks.cc | 8 +++ gcc/targhooks.h | 1 + gcc/tree-vect-patterns.cc | 159 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 189 insertions(+) diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index f10d9a59c667..c7535d07f4dd 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6449,6 +6449,13 @@ The default implementation returns a @code{MODE_VECTOR_INT} with the same size and number of elements as @var{mode}, if such a mode exists. @end deftypefn +@deftypefn {Target Hook} bool TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE (unsigned @var{ifn}) +This hook returns true if masked operation @var{ifn} (really of +type @code{internal_fn}) should be considered more expensive to use than +implementing the same operation without masking. GCC can then try to use +unconditional operations instead with extra selects. 
+@end deftypefn + @deftypefn {Target Hook} bool TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE (unsigned @var{ifn}) This hook returns true if masked internal function @var{ifn} (really of type @code{internal_fn}) should be considered expensive when the mask is diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 24596eb2f6b4..64cea3b1edaf 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4290,6 +4290,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_GET_MASK_MODE +@hook TARGET_VECTORIZE_CONDITIONAL_OPERATION_IS_EXPENSIVE + @hook TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE @hook TARGET_VECTORIZE_CREATE_COSTS diff --git a/gcc/target.def b/gcc/target.def index ce4d1ecd58be..3de1aad4c84d 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2033,6 +2033,18 @@ same size and number of elements as @var{mode}, if such a mode exists.", (machine_mode mode), default_get_mask_mode) +/* Function to say whether a conditional operation is expensive when + compared to non-masked operations. */ +DEFHOOK +(conditional_operation_is_expensive, + "This hook returns true if masked operation @var{ifn} (really of\n\ +type @code{internal_fn}) should be considered more expensive to use than\n\ +implementing the same operation without masking. GCC can then try to use\n\ +unconditional operations instead with extra selects.", + bool, + (unsigned ifn), + default_conditional_operation_is_expensive) + /* Function to say whether a masked operation is expensive when the mask is all zeros. */ DEFHOOK diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc index b10104c363bf..793932a77c60 100644 --- a/gcc/targhooks.cc +++ b/gcc/targhooks.cc @@ -1608,6 +1608,14 @@ default_get_mask_mode (machine_mode mode) /* By default consider masked stores to be expensive. */ +bool +default_conditional_operation_is_expensive (unsigned ifn) +{ + return ifn == IFN_MASK_STORE; +} + +/* By default consider masked stores to be expensive. 
*/ + bool default_empty_mask_is_expensive (unsigned ifn) { diff --git a/gcc/targhooks.h b/gcc/targhooks.h index 3cbca0f13a5e..2704d6008f14 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -123,6 +123,7 @@ extern opt_machine_mode default_vectorize_related_mode (machine_mode, poly_uint64); extern opt_machine_mode default_get_mask_mode (machine_mode); extern bool default_empty_mask_is_expensive (unsigned); +extern bool default_conditional_operation_is_expensive (unsigned); extern vector_costs *default_vectorize_create_costs (vec_info *, bool); /* OpenACC hooks. */ diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index 4570c25b6647..b0821c74c1d8 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -51,8 +51,10 @@ along with GCC; see the file COPYING3. If not see #include "omp-simd-clone.h" #include "predict.h" #include "tree-vector-builder.h" +#include "tree-ssa-loop-ivopts.h" #include "vec-perm-indices.h" #include "gimple-range.h" +#include "alias.h" /* TODO: Note the vectorizer still builds COND_EXPRs with GENERIC compares @@ -6526,6 +6528,162 @@ vect_recog_gather_scatter_pattern (vec_info *vinfo, return pattern_stmt; } +/* Helper method of vect_recog_cond_store_pattern, checks to see if COND_ARG + points to a load statement that reads the same data as that of + STORE_VINFO. */ + +static bool +vect_cond_store_pattern_same_ref (vec_info *vinfo, + stmt_vec_info store_vinfo, tree cond_arg) +{ + stmt_vec_info load_stmt_vinfo = vinfo->lookup_def (cond_arg); + if (!load_stmt_vinfo + || !STMT_VINFO_DATA_REF (load_stmt_vinfo) + || DR_IS_WRITE (STMT_VINFO_DATA_REF (load_stmt_vinfo)) + || !same_data_refs (STMT_VINFO_DATA_REF (store_vinfo), + STMT_VINFO_DATA_REF (load_stmt_vinfo))) + return false; + + return true; +} + +/* Function vect_recog_cond_store_pattern + + Try to find the following pattern: + + x = *_3; + c = a CMP b; + y = c ? 
t_20 : x; + *_3 = y; + + where the store of _3 happens on a conditional select on a value loaded + from the same location. In such case we can elide the initial load if + MASK_STORE is supported and instead only conditionally write out the result. + + The pattern produces for the above: + + c = a CMP b; + .MASK_STORE (_3, c, t_20) + + Input: + + * STMT_VINFO: The stmt from which the pattern search begins. In the + example, when this function is called with _3 then the search begins. + + Output: + + * TYPE_OUT: The type of the output of this pattern. + + * Return value: A new stmt that will be used to replace the sequence. */ + +static gimple * +vect_recog_cond_store_pattern (vec_info *vinfo, + stmt_vec_info stmt_vinfo, tree *type_out) +{ + loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); + if (!loop_vinfo) + return NULL; + + gimple *store_stmt = STMT_VINFO_STMT (stmt_vinfo); + + /* Needs to be a gimple store where we have DR info for. */ + if (!STMT_VINFO_DATA_REF (stmt_vinfo) + || DR_IS_READ (STMT_VINFO_DATA_REF (stmt_vinfo)) + || !gimple_store_p (store_stmt)) + return NULL; + + tree st_rhs = gimple_assign_rhs1 (store_stmt); + + if (TREE_CODE (st_rhs) != SSA_NAME) + return NULL; + + gassign *cond_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (st_rhs)); + if (!cond_stmt || gimple_assign_rhs_code (cond_stmt) != COND_EXPR) + return NULL; + + /* Check if the else value matches the original loaded one. 
*/ + bool invert = false; + tree cmp_ls = gimple_arg (cond_stmt, 0); + tree cond_arg1 = gimple_arg (cond_stmt, 1); + tree cond_arg2 = gimple_arg (cond_stmt, 2); + + if (!vect_cond_store_pattern_same_ref (vinfo, stmt_vinfo, cond_arg2) + && !(invert = vect_cond_store_pattern_same_ref (vinfo, stmt_vinfo, + cond_arg1))) + return NULL; + + vect_pattern_detected ("vect_recog_cond_store_pattern", store_stmt); + + tree scalar_type = TREE_TYPE (st_rhs); + if (VECTOR_TYPE_P (scalar_type)) + return NULL; + + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); + if (vectype == NULL_TREE) + return NULL; + + machine_mode mask_mode; + machine_mode vecmode = TYPE_MODE (vectype); + if (targetm.vectorize.conditional_operation_is_expensive (IFN_MASK_STORE) + || !targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) + || !can_vec_mask_load_store_p (vecmode, mask_mode, false)) + return NULL; + + tree base = DR_REF (STMT_VINFO_DATA_REF (stmt_vinfo)); + if (may_be_nonaddressable_p (base)) + return NULL; + + /* We need to use the false parameter of the conditional select. */ + tree cond_store_arg = invert ? cond_arg2 : cond_arg1; + tree cond_load_arg = invert ? cond_arg1 : cond_arg2; + gimple *load_stmt = SSA_NAME_DEF_STMT (cond_load_arg); + + /* This is a rough estimation to check that there aren't any aliasing stores + in between the load and store. It's a bit strict, but for now it's good + enough. */ + if (gimple_vuse (load_stmt) != gimple_vuse (store_stmt)) + return NULL; + + /* If we have to invert the condition, i.e. use the true argument rather than + the false argument, we have to negate the mask. */ + if (invert) + { + tree var = vect_recog_temp_ssa_var (boolean_type_node, NULL); + + /* Invert the mask using ^ 1. 
*/ + tree itype = TREE_TYPE (cmp_ls); + gassign *conv = gimple_build_assign (var, BIT_XOR_EXPR, cmp_ls, + build_int_cst (itype, 1)); + + tree mask_vec_type = get_mask_type_for_scalar_type (vinfo, itype); + append_pattern_def_seq (vinfo, stmt_vinfo, conv, mask_vec_type, itype); + cmp_ls= var; + } + + if (TREE_CODE (base) != MEM_REF) + base = build_fold_addr_expr (base); + + tree ptr = build_int_cst (reference_alias_ptr_type (base), + get_object_alignment (base)); + + /* Convert the mask to the right form. */ + tree mask = vect_convert_mask_for_vectype (cmp_ls, vectype, stmt_vinfo, + vinfo); + + gcall *call + = gimple_build_call_internal (IFN_MASK_STORE, 4, base, ptr, mask, + cond_store_arg); + gimple_set_location (call, gimple_location (store_stmt)); + + /* Copy across relevant vectorization info and associate DR with the + new pattern statement instead of the original statement. */ + stmt_vec_info pattern_stmt_info = loop_vinfo->add_stmt (call); + loop_vinfo->move_dr (pattern_stmt_info, stmt_vinfo); + + *type_out = vectype; + return call; +} + /* Return true if TYPE is a non-boolean integer type. These are the types that we want to consider for narrowing. */ @@ -7191,6 +7349,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = { of mask conversion that are needed for gather and scatter internal functions. */ { vect_recog_gather_scatter_pattern, "gather_scatter" }, + { vect_recog_cond_store_pattern, "cond_store" }, { vect_recog_mask_conversion_pattern, "mask_conversion" }, { vect_recog_widen_plus_pattern, "widen_plus" }, { vect_recog_widen_minus_pattern, "widen_minus" },