Robin Dapp <rd...@linux.ibm.com> writes:
> Hi,
>
> as discussed in
> https://gcc.gnu.org/pipermail/gcc-patches/2021-October/582627.html this
> introduces a bias parameter for the len_load/len_store ifns as well as
> optabs that is meant to distinguish between Power and s390 variants.
> The default is a bias of 0, while in s390's case vll/vstl do not support
> lengths of zero bytes and a bias of -1 should be used.
>
> Bootstrapped and regtested on Power9 (--with-cpu=power9) and s390
> (--with-arch=z15).
>
> The tiny changes in the Power backend I will post separately.
Some comments in addition to what Kewen said:

> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8312d08aab2..993e32c1854 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
>  static void
>  expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, target, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, target, mask, bias;
>    insn_code icode;
>
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2727,7 +2727,16 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>                                  TYPE_UNSIGNED (TREE_TYPE (maskt)));
>    else
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +  if (optab == len_load_optab)
> +    {
> +      biast = gimple_call_arg (stmt, 3);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
> +  else
> +    expand_insn (icode, 3, ops);
> +

The previous “if” is also for len_load_optab, so it seems better
to combine the two.

>    if (!rtx_equal_p (target, ops[0].value))
>      emit_move_insn (target, ops[0].value);
>  }
> @@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  static void
>  expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  {
> -  class expand_operand ops[3];
> -  tree type, lhs, rhs, maskt;
> -  rtx mem, reg, mask;
> +  class expand_operand ops[4];
> +  tree type, lhs, rhs, maskt, biast;
> +  rtx mem, reg, mask, bias;
>    insn_code icode;
>
>    maskt = gimple_call_arg (stmt, 2);
> @@ -2770,7 +2779,16 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>                                  TYPE_UNSIGNED (TREE_TYPE (maskt)));
>    else
>      create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
> -  expand_insn (icode, 3, ops);
> +
> +  if (optab == len_store_optab)
> +    {
> +      biast = gimple_call_arg (stmt, 4);
> +      bias = expand_normal (biast);
> +      create_input_operand (&ops[3], bias, QImode);
> +      expand_insn (icode, 4, ops);
> +    }
> +  else
> +    expand_insn (icode, 3, ops);

Same idea here.

>  }
>
>  #define expand_mask_store_optab_fn expand_partial_store_optab_fn
> @@ -4172,6 +4190,30 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
>           && insn_operand_matches (icode, 4, GEN_INT (align)));
>  }
>
> +/* Return the supported bias for the len_load IFN.  For now we support a
> +   default bias of 0 and -1 in case 0 is not an allowable length for len_load.

Neither bias is really the default.  How about just “For now we only
support the biases 0 and -1.”?

> +   If none of these biases match what the backend provides, return
> +   VECT_PARTIAL_BIAS_UNSUPPORTED.  */
> +
> +signed char
> +internal_len_load_bias_supported (internal_fn ifn, machine_mode mode)

Since this is now providing the bias rather than asking about a given
bias (thanks), I think we should drop “_supported”.

> +{
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      /* We only support a bias of 0 (default) or -1.  Try both
> +         of them.  */
> +      if (insn_operand_matches (icode, 3, GEN_INT (0)))
> +        return 0;
> +      else if (insn_operand_matches (icode, 3, GEN_INT (-1)))

Minor nit, but: redundant else.
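Taken together, these last three nits would give something like the
following (completely untested sketch; the shorter name is just what
dropping “_supported” suggests, and the comment uses the wording
proposed above):

  /* Return the bias for the len_load IFN.  For now we only support
     the biases 0 and -1.  If neither of them matches what the backend
     provides, return VECT_PARTIAL_BIAS_UNSUPPORTED.  */

  signed char
  internal_len_load_bias (internal_fn ifn, machine_mode mode)
  {
    optab optab = direct_internal_fn_optab (ifn);
    insn_code icode = direct_optab_handler (optab, mode);

    if (icode != CODE_FOR_nothing)
      {
        /* Try a bias of 0 first, then -1.  */
        if (insn_operand_matches (icode, 3, GEN_INT (0)))
          return 0;
        if (insn_operand_matches (icode, 3, GEN_INT (-1)))
          return -1;
      }

    return VECT_PARTIAL_BIAS_UNSUPPORTED;
  }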
> +        return -1;
> +    }
> +
> +  return VECT_PARTIAL_BIAS_UNSUPPORTED;
> +}
> +
>  /* Expand STMT as though it were a call to internal function FN.  */
>
>  void
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 19d0f849a5a..af28cf0d566 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>                                                      tree, tree, int);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                                  poly_uint64, unsigned int);
> +#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
> +
> +extern signed char internal_len_load_bias_supported (internal_fn ifn,
> +                                                     machine_mode);
>
>  extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
>                                      bool, bool, bool, bool, tree *);
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index e94356d76e9..cd2c33fc4a7 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1163,6 +1163,15 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
>    if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
>      return false;
>
> +  opt_machine_mode len_load_mode = get_len_load_store_mode
> +    (loop_vinfo->vector_mode, false);
> +  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
> +     len_loads with a length of zero.  In order to avoid that we prohibit
> +     more than one loop length here.  */
> +  if (internal_len_load_bias_supported (IFN_LEN_LOAD, len_load_mode.require ())
> +      == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)

Might as well add the require () to the get_len_load_store_mode call.
As Kewen says, it would be good to store the bias computed here in the
loop_vec_info and assert that all users agree.  If we do that, then
there's also the option of adding the bias to the lengths in
vect_set_loop_controls_directly.

> +    return false;
> +
>    unsigned int max_nitems_per_iter = 1;
>    unsigned int i;
>    rgroup_controls *rgl;
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 17849b575b7..c3df26c8009 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -8289,9 +8289,30 @@ vectorizable_store (vec_info *vinfo,
>                                                        gsi);
>                     vec_oprnd = var;
>                   }
> +
> +              /* Check which bias value to use.  Default is 0.
> +                 A bias of -1 means that we cannot emit a LEN_LOAD with
> +                 a length of 0 and need to subtract 1 from the length.  */
> +              char biasval = internal_len_load_bias_supported
> +                (IFN_LEN_STORE, new_vmode);
> +              tree bias = build_int_cst (intQI_type_node, biasval);
> +              tree new_len = final_len;
> +              if (biasval != 0
> +                  && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +                {
> +                  new_len = make_ssa_name (TREE_TYPE (final_len));
> +                  gassign *m1
> +                    = gimple_build_assign (new_len, MINUS_EXPR,
> +                                           final_len,
> +                                           build_one_cst (TREE_TYPE
> +                                                          (final_len)));
> +                  vect_finish_stmt_generation (vinfo, stmt_info, m1,
> +                                               gsi);
> +                }

If we don't do that and add the bias at the use site instead, then
it'd be good to split this out into a subroutine so that the load and
store code can share it.
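Roughly something like the following, say (completely untested; the
helper name vect_gen_len_with_bias is just made up for illustration):

  /* If BIASVAL requires it, subtract 1 from length LEN, emitting any
     new statement via GSI, and return the length to pass to the
     LEN_LOAD/LEN_STORE call.  */

  static tree
  vect_gen_len_with_bias (vec_info *vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, tree len,
                          signed char biasval)
  {
    if (biasval == 0 || biasval == VECT_PARTIAL_BIAS_UNSUPPORTED)
      return len;

    tree new_len = make_ssa_name (TREE_TYPE (len));
    gassign *m1 = gimple_build_assign (new_len, MINUS_EXPR, len,
                                       build_one_cst (TREE_TYPE (len)));
    vect_finish_stmt_generation (vinfo, stmt_info, m1, gsi);
    return new_len;
  }

with vectorizable_load and vectorizable_store then just calling it to
get the length they pass to the IFN.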
Thanks,
Richard

>               gcall *call
> -               = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
> -                                             ptr, final_len, vec_oprnd);
> +               = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
> +                                             ptr, new_len, vec_oprnd,
> +                                             bias);
>               gimple_call_set_nothrow (call, true);
>               vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
>               new_stmt = call;
> @@ -9588,24 +9609,46 @@ vectorizable_load (vec_info *vinfo,
>                                              vec_num * j + i);
>                     tree ptr = build_int_cst (ref_type,
>                                               align * BITS_PER_UNIT);
> +
> +                   machine_mode vmode = TYPE_MODE (vectype);
> +                   opt_machine_mode new_ovmode
> +                     = get_len_load_store_mode (vmode, true);
> +                   machine_mode new_vmode = new_ovmode.require ();
> +                   tree qi_type = unsigned_intQI_type_node;
> +                   tree new_vtype
> +                     = build_vector_type_for_mode (qi_type, new_vmode);
> +
> +                   /* Check which bias value to use.  Default is 0.  */
> +                   char biasval = internal_len_load_bias_supported
> +                     (IFN_LEN_LOAD, new_vmode);
> +
> +                   tree bias = build_int_cst (intQI_type_node, biasval);
> +                   tree new_len = final_len;
> +                   if (biasval != 0
> +                       && biasval != VECT_PARTIAL_BIAS_UNSUPPORTED)
> +                     {
> +                       new_len = make_ssa_name (TREE_TYPE (final_len));
> +                       gassign *m1 = gimple_build_assign (new_len,
> +                                                          MINUS_EXPR,
> +                                                          final_len,
> +                                                          build_one_cst
> +                                                          (TREE_TYPE
> +                                                           (final_len)));
> +                       vect_finish_stmt_generation (vinfo, stmt_info, m1,
> +                                                    gsi);
> +                     }
> +
>                     gcall *call
> -                     = gimple_build_call_internal (IFN_LEN_LOAD, 3,
> +                     = gimple_build_call_internal (IFN_LEN_LOAD, 4,
>                                                     dataref_ptr, ptr,
> -                                                   final_len);
> +                                                   new_len, bias);
>                     gimple_call_set_nothrow (call, true);
>                     new_stmt = call;
>                     data_ref = NULL_TREE;
>
>                     /* Need conversion if it's wrapped with VnQI.  */
> -                   machine_mode vmode = TYPE_MODE (vectype);
> -                   opt_machine_mode new_ovmode
> -                     = get_len_load_store_mode (vmode, true);
> -                   machine_mode new_vmode = new_ovmode.require ();
>                     if (vmode != new_vmode)
>                       {
> -                       tree qi_type = unsigned_intQI_type_node;
> -                       tree new_vtype
> -                         = build_vector_type_for_mode (qi_type, new_vmode);
>                         tree var = vect_get_new_ssa_name (new_vtype,
>                                                           vect_simple_var);
>                         gimple_set_lhs (call, var);