On Thu, 24 Aug 2023, Juzhe-Zhong wrote: > Hi. > > This patch applies LEN_FOLD_EXTRACT_LAST in the loop vectorizer. > > Consider the following case: > #include <stdint.h> > > #define N 32 > > /* Simple condition reduction. */ > > int __attribute__ ((noinline, noclone)) > condition_reduction (int *a, int min_v) > { > int last = 66; /* High start value. */ > > for (int i = 0; i < N; i++) > if (a[i] < min_v) > last = i; > > return last; > } > > With this patch, we can generate the following IR: > > _44 = .SELECT_VL (ivtmp_42, POLY_INT_CST [4, 4]); > _34 = vect_vec_iv_.5_33 + { POLY_INT_CST [4, 4], ... }; > ivtmp_36 = _44 * 4; > vect__4.8_39 = .MASK_LEN_LOAD (vectp_a.6_37, 32B, { -1, ... }, _44, 0); > > mask__11.9_41 = vect__4.8_39 < vect_cst__40; > last_5 = .LEN_FOLD_EXTRACT_LAST (last_14, mask__11.9_41, vect_vec_iv_.5_33, > _44, 0); > ...
LGTM. Thanks, Richard. > gcc/ChangeLog: > > * tree-vect-loop.cc (vectorizable_reduction): Apply > LEN_FOLD_EXTRACT_LAST. > * tree-vect-stmts.cc (vectorizable_condition): Ditto. > > --- > gcc/tree-vect-loop.cc | 7 ++++-- > gcc/tree-vect-stmts.cc | 52 ++++++++++++++++++++++++++++++++++++------ > 2 files changed, 50 insertions(+), 9 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 1cd6c291377..ebee8037e02 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -7494,8 +7494,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > } > > if (reduc_chain_length == 1 > - && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, > - vectype_in, OPTIMIZE_FOR_SPEED)) > + && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in, > + OPTIMIZE_FOR_SPEED) > + || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST, > + vectype_in, > + OPTIMIZE_FOR_SPEED))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 413a88750d6..be9f3a280bd 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -11740,8 +11740,17 @@ vectorizable_condition (vec_info *vinfo, > && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > { > if (reduction_type == EXTRACT_LAST_REDUCTION) > - vect_record_loop_mask (loop_vinfo, &LOOP_VINFO_MASKS (loop_vinfo), > - ncopies * vec_num, vectype, NULL); > + { > + if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST, > + vectype, OPTIMIZE_FOR_SPEED)) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), > + ncopies * vec_num, vectype, 1); > + else > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), > + ncopies * vec_num, vectype, NULL); > + } > /* Extra inactive lanes should be safe for vect_nested_cycle. 
*/ > else if (STMT_VINFO_DEF_TYPE (reduc_info) != vect_nested_cycle) > { > @@ -11772,7 +11781,13 @@ vectorizable_condition (vec_info *vinfo, > mask to the condition, or to its inverse. */ > > vec_loop_masks *masks = NULL; > - if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + vec_loop_lens *lens = NULL; > + if (loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + { > + if (reduction_type == EXTRACT_LAST_REDUCTION) > + lens = &LOOP_VINFO_LENS (loop_vinfo); > + } > + else if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > { > if (reduction_type == EXTRACT_LAST_REDUCTION) > masks = &LOOP_VINFO_MASKS (loop_vinfo); > @@ -11910,7 +11925,8 @@ vectorizable_condition (vec_info *vinfo, > /* Force vec_compare to be an SSA_NAME rather than a comparison, > in cases where that's necessary. */ > > - if (masks || reduction_type == EXTRACT_LAST_REDUCTION) > + tree len = NULL_TREE, bias = NULL_TREE; > + if (masks || lens || reduction_type == EXTRACT_LAST_REDUCTION) > { > if (!is_gimple_val (vec_compare)) > { > @@ -11931,6 +11947,23 @@ vectorizable_condition (vec_info *vinfo, > vec_compare = vec_compare_name; > } > > + if (direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST, > + vectype, OPTIMIZE_FOR_SPEED)) > + { > + if (lens) > + { > + len = vect_get_loop_len (loop_vinfo, gsi, lens, > + vec_num * ncopies, vectype, i, 1); > + signed char biasval > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + bias = build_int_cst (intQI_type_node, biasval); > + } > + else > + { > + len = size_int (TYPE_VECTOR_SUBPARTS (vectype)); > + bias = build_int_cst (intQI_type_node, 0); > + } > + } > if (masks) > { > tree loop_mask > @@ -11950,9 +11983,14 @@ vectorizable_condition (vec_info *vinfo, > { > gimple *old_stmt = vect_orig_stmt (stmt_info)->stmt; > tree lhs = gimple_get_lhs (old_stmt); > - new_stmt = gimple_build_call_internal > - (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, > - vec_then_clause); > + if (len) > + new_stmt = 
gimple_build_call_internal > + (IFN_LEN_FOLD_EXTRACT_LAST, 5, else_clause, vec_compare, > + vec_then_clause, len, bias); > + else > + new_stmt = gimple_build_call_internal > + (IFN_FOLD_EXTRACT_LAST, 3, else_clause, vec_compare, > + vec_then_clause); > gimple_call_set_lhs (new_stmt, lhs); > SSA_NAME_DEF_STMT (lhs) = new_stmt; > if (old_stmt == gsi_stmt (*gsi)) > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)