On Wed, 12 Jul 2023, juzhe.zh...@rivai.ai wrote:

> Thank you so much.
> I am gonna wait for Richi's final approval.
It's good enough when either of us approves, unless we explicitly ask
to wait for somebody else.  LGTM anyway.

Richard.

> Thanks.
>
>
> juzhe.zh...@rivai.ai
>
> From: Richard Sandiford
> Date: 2023-07-12 18:53
> To: juzhe.zhong
> CC: gcc-patches; rguenther
> Subject: Re: [PATCH V2] VECT: Apply COND_LEN_* into vectorizable_operation
> juzhe.zh...@rivai.ai writes:
> > From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
> >
> > Hi, Richard and Richi.
> > As we discussed before, COND_LEN_* patterns were added for multiple
> > situations.
> > This patch applies COND_LEN_* to the following situation in
> > "vectorizable_operation":
> >
> >   /* If operating on inactive elements could generate spurious traps,
> >      we need to restrict the operation to active lanes.  Note that this
> >      specifically doesn't apply to unhoisted invariants, since they
> >      operate on the same value for every lane.
> >
> >      Similarly, if this operation is part of a reduction, a fully-masked
> >      loop should only change the active lanes of the reduction chain,
> >      keeping the inactive lanes as-is.  */
> >   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
> > 			    || reduc_idx >= 0);
> >
> > mask_out_inactive is also true when the loop uses length loop control.
> >
> > So we can handle the 2 following cases:
> >
> > 1. Integer division:
> >
> >    #define TEST_TYPE(TYPE)					\
> >    __attribute__((noipa))					\
> >    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
> >    {							\
> >      for (int i = 0; i < n; i++)				\
> >        dst[i] = a[i] % b[i];					\
> >    }
> >    #define TEST_ALL()	\
> >      TEST_TYPE(int8_t)
> >    TEST_ALL()
> >
> > With this patch:
> >
> >   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
> >   ivtmp_45 = _61 * 4;
> >   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
> >   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
> >   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52,
> > 				  vect__4.8_48, _61, 0);
> >   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
> >
> > 2. Floating-point arithmetic **WITHOUT** -ffast-math:
> >
> >    #define TEST_TYPE(TYPE)					\
> >    __attribute__((noipa))					\
> >    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n)	\
> >    {							\
> >      for (int i = 0; i < n; i++)				\
> >        dst[i] = a[i] + b[i];					\
> >    }
> >    #define TEST_ALL()	\
> >      TEST_TYPE(float)
> >    TEST_ALL()
> >
> > With this patch:
> >
> >   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
> >   ivtmp_45 = _61 * 4;
> >   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
> >   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
> >   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52,
> > 				  vect__4.8_48, _61, 0);
> >   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
> >
> > With this patch, we can make sure operations won't trap for the elements
> > that "mask_out_inactive" applies to.
> >
> > gcc/ChangeLog:
> >
> > 	* internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_*
> > 	support.
> > 	(CASE): Ditto.
> > 	(get_conditional_len_internal_fn): New function.
> > 	* internal-fn.h (get_conditional_len_internal_fn): Ditto.
> > 	* tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_*
> > 	support.
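To make the semantics in the dumps above concrete: a scalar C model of
LHS = .COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS) could look as follows.
This is an illustration only, not code from the patch or from GCC; the
function name, parameter names and num_lanes are invented.

  /* Scalar model of LHS = .COND_LEN_ADD (MASK, A, B, ELSE, LEN, BIAS).
     Lanes below LEN + BIAS follow the mask; for lanes at or above
     LEN + BIAS the real optab leaves the result undefined, so this
     model arbitrarily picks the ELSE value there.  */
  void
  cond_len_add_model (float *lhs, const _Bool *mask, const float *a,
		      const float *b, const float *else_vals,
		      int len, int bias, int num_lanes)
  {
    for (int i = 0; i < num_lanes; i++)
      if (i < len + bias && mask[i])
	lhs[i] = a[i] + b[i];	/* Active lane: the real operation.  */
      else
	lhs[i] = else_vals[i];	/* A[i] + B[i] is never evaluated here,
				   so inactive lanes cannot trap.  */
  }

The point of the patch is exactly that the operation is only evaluated
for active lanes, which is what makes trapping operations safe under
length control.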
> > 
> > ---
> >  gcc/internal-fn.cc     | 65 ++++++++++++++++++++++++++++++------------
> >  gcc/internal-fn.h      |  1 +
> >  gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++++++-------
> >  3 files changed, 85 insertions(+), 29 deletions(-)
> > 
> > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > index f9aaf66cf2a..7e3a8cc8412 100644
> > --- a/gcc/internal-fn.cc
> > +++ b/gcc/internal-fn.cc
> > @@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
> >    0
> >  };
> >  
> > -/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
> > -   tree code CODE.  */
> > +/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
> > +   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
> > +   for each such IFN_COND_##SUFFIX.  */
> >  #define FOR_EACH_CODE_MAPPING(T) \
> > -  T (PLUS_EXPR, IFN_COND_ADD) \
> > -  T (MINUS_EXPR, IFN_COND_SUB) \
> > -  T (MULT_EXPR, IFN_COND_MUL) \
> > -  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
> > -  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
> > -  T (RDIV_EXPR, IFN_COND_RDIV) \
> > -  T (MIN_EXPR, IFN_COND_MIN) \
> > -  T (MAX_EXPR, IFN_COND_MAX) \
> > -  T (BIT_AND_EXPR, IFN_COND_AND) \
> > -  T (BIT_IOR_EXPR, IFN_COND_IOR) \
> > -  T (BIT_XOR_EXPR, IFN_COND_XOR) \
> > -  T (LSHIFT_EXPR, IFN_COND_SHL) \
> > -  T (RSHIFT_EXPR, IFN_COND_SHR) \
> > -  T (NEGATE_EXPR, IFN_COND_NEG)
> > +  T (PLUS_EXPR, ADD) \
> > +  T (MINUS_EXPR, SUB) \
> > +  T (MULT_EXPR, MUL) \
> > +  T (TRUNC_DIV_EXPR, DIV) \
> > +  T (TRUNC_MOD_EXPR, MOD) \
> > +  T (RDIV_EXPR, RDIV) \
> > +  T (MIN_EXPR, MIN) \
> > +  T (MAX_EXPR, MAX) \
> > +  T (BIT_AND_EXPR, AND) \
> > +  T (BIT_IOR_EXPR, IOR) \
> > +  T (BIT_XOR_EXPR, XOR) \
> > +  T (LSHIFT_EXPR, SHL) \
> > +  T (RSHIFT_EXPR, SHR) \
> > +  T (NEGATE_EXPR, NEG)
> >  
> >  /* Return a function that only performs CODE when a certain condition is met
> >     and that uses a given fallback value otherwise.  For example, if CODE is
> > @@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
> >  {
> >    switch (code)
> >      {
> > -#define CASE(CODE, IFN) case CODE: return IFN;
> > +#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
> >        FOR_EACH_CODE_MAPPING(CASE)
> >  #undef CASE
> >      default:
> > @@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
> >  {
> >    switch (ifn)
> >      {
> > -#define CASE(CODE, IFN) case IFN: return CODE;
> > +#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
> >        FOR_EACH_CODE_MAPPING(CASE)
> >  #undef CASE
> >      default:
> > @@ -4337,6 +4338,34 @@ conditional_internal_fn_code (internal_fn ifn)
> >      }
> >  }
> >  
> > +/* Return a function that only performs CODE when a certain condition is met
> > +   and that uses a given fallback value otherwise.  For example, if CODE is
> 
> This describes COND_* rather than COND_LEN_*.  How about:
> 
> /* Like get_conditional_internal_fn, but return a function that
>    additionally restricts the operation to the leading elements
>    of a vector.  The number of elements to process is given by
>    a length and bias pair, as for IFN_LOAD_LEN.  The values of
>    the remaining elements are undefined.
> 
>    For example, if CODE is [...]
> 
> (Please confirm that the last sentence of the first paragraph is true.)
> 
> > +   a binary operation associated with conditional function FN:
> > +
> > +     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
> > +
> > +   is equivalent to the C expression:
> > +
> > +     for (int i = 0; i < LEN + BIAS; i++)
> > +       LHS[i] = COND[i] ? A[i] CODE B[i] : ELSE[i];
> > +
> > +   operating elementwise if the operands are vectors.
> > +
> 
> The "operating elementwise" doesn't make sense in this context,
> since the example is a loop that always operates elementwise.
> Seems better to drop that line.
> 
> OK with those changes, thanks.
> 
> Richard
> 
> > +   Return IFN_LAST if no such function exists.  */
> > +
> > +internal_fn
> > +get_conditional_len_internal_fn (tree_code code)
> > +{
> > +  switch (code)
> > +    {
> > +#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
> > +      FOR_EACH_CODE_MAPPING(CASE)
> > +#undef CASE
> > +    default:
> > +      return IFN_LAST;
> > +    }
> > +}
> > +
> >  /* Invoke T(IFN) for each internal function IFN that also has an
> >     IFN_COND_* form.  */
> >  #define FOR_EACH_COND_FN_PAIR(T) \
> > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > index 4234bbfed87..dd1bab0bddf 100644
> > --- a/gcc/internal-fn.h
> > +++ b/gcc/internal-fn.h
> > @@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
> >  
> >  extern internal_fn get_conditional_internal_fn (tree_code);
> >  extern internal_fn get_conditional_internal_fn (internal_fn);
> > +extern internal_fn get_conditional_len_internal_fn (tree_code);
> >  extern tree_code conditional_internal_fn_code (internal_fn);
> >  extern internal_fn get_unconditional_internal_fn (internal_fn);
> >  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 10e71178ce7..dd24f017235 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
> >  
> >    int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
> >    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
> > +  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
> >    internal_fn cond_fn = get_conditional_internal_fn (code);
> > +  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
> >  
> >    /* If operating on inactive elements could generate spurious traps,
> >       we need to restrict the operation to active lanes.  Note that this
> > @@ -6730,9 +6732,17 @@
> >        && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
> >        && mask_out_inactive)
> >      {
> > -      if (cond_fn == IFN_LAST
> > -	  || !direct_internal_fn_supported_p (cond_fn, vectype,
> > -					      OPTIMIZE_FOR_SPEED))
> > +      if (cond_fn != IFN_LAST
> > +	  && direct_internal_fn_supported_p (cond_fn, vectype,
> > +					     OPTIMIZE_FOR_SPEED))
> > +	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> > +			       vectype, NULL);
> > +      else if (cond_len_fn != IFN_LAST
> > +	       && direct_internal_fn_supported_p (cond_len_fn, vectype,
> > +						  OPTIMIZE_FOR_SPEED))
> > +	vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
> > +			      1);
> > +      else
> >  	{
> >  	  if (dump_enabled_p ())
> >  	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > @@ -6740,9 +6750,6 @@
> >  			     " conditional operation is available.\n");
> >  	  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> >  	}
> > -      else
> > -	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> > -			       vectype, NULL);
> >      }
> >  
> >    /* Put types on constant and invariant SLP children.  */
> > @@ -6805,6 +6812,7 @@
> >  			 "transform binary/unary operation.\n");
> >  
> >    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> > +  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> >  
> >    /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
> >       vectors with unsigned elements, but the result is signed.  So, we
> > @@ -6971,11 +6979,16 @@
> >  	  gimple_assign_set_lhs (new_stmt, new_temp);
> >  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> >  	}
> > -      else if (masked_loop_p && mask_out_inactive)
> > +      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
> >  	{
> > -	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> > -					  vec_num * ncopies, vectype, i);
> > -	  auto_vec<tree> vops (5);
> > +	  tree mask;
> > +	  if (masked_loop_p)
> > +	    mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> > +				       vec_num * ncopies, vectype, i);
> > +	  else
> > +	    /* Dummy mask.  */
> > +	    mask = build_minus_one_cst (truth_type_for (vectype));
> > +	  auto_vec<tree> vops (6);
> >  	  vops.quick_push (mask);
> >  	  vops.quick_push (vop0);
> >  	  if (vop1)
> > @@ -6995,7 +7008,20 @@
> >  	    (cond_fn, vectype, vops.length () - 1, &vops[1]);
> >  	  vops.quick_push (else_value);
> >  	}
> > -      gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
> > +      if (len_loop_p)
> > +	{
> > +	  tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
> > +					vec_num * ncopies, vectype, i, 1);
> > +	  signed char biasval
> > +	    = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> > +	  tree bias = build_int_cst (intQI_type_node, biasval);
> > +	  vops.quick_push (len);
> > +	  vops.quick_push (bias);
> > +	}
> > +      gcall *call
> > +	= gimple_build_call_internal_vec (masked_loop_p ? cond_fn
> > +						        : cond_len_fn,
> > +					  vops);
> >        new_temp = make_ssa_name (vec_dest, call);
> >        gimple_call_set_lhs (call, new_temp);
> >        gimple_call_set_nothrow (call, true);
> > 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
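For completeness, the trap-safety property that the new COND_LEN_* code
path provides for the integer-division example can likewise be modelled
in scalar C.  Again this is an illustration only, not code from the
patch or from GCC; the function name and parameters are invented, with
vl standing in for the .SELECT_VL result and bias for the load/store
bias.

  #include <stdint.h>

  /* Scalar model of the vectorized body of dst[i] = a[i] % b[i] under
     length control.  Lanes with index >= vl + bias are never evaluated,
     so a zero divisor sitting in the inactive tail of the last
     iteration cannot fault.  */
  static void
  cond_len_mod_model (int32_t *dst, const int32_t *a, const int32_t *b,
		      int vl, int bias, int num_lanes)
  {
    for (int i = 0; i < num_lanes; i++)
      if (i < vl + bias)
	dst[i] = a[i] % b[i];	/* Active lane only: b[i] is real data.  */
    /* Inactive lanes are skipped; an unconditional vector division
       would process all num_lanes lanes and could trap on garbage
       divisors.  */
  }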