I fixed the comments as you suggested.

Thanks a lot!
I will merge it soon, once the bootstrap && regression testing finishes.



juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-07-12 20:14
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V3] VECT: Apply COND_LEN_* into vectorizable_operation
juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
>
> Hi, Richard and Richi.
> As we discussed before, COND_LEN_* patterns were added for multiple
> situations.
> This patch applies COND_LEN_* to the following situation:
>
> It adds support for the case in "vectorizable_operation" where:
>   /* If operating on inactive elements could generate spurious traps,
>      we need to restrict the operation to active lanes.  Note that this
>      specifically doesn't apply to unhoisted invariants, since they
>      operate on the same value for every lane.
>
>      Similarly, if this operation is part of a reduction, a fully-masked
>      loop should only change the active lanes of the reduction chain,
>      keeping the inactive lanes as-is.  */
>   bool mask_out_inactive = ((!is_invariant && gimple_could_trap_p (stmt))
>                             || reduc_idx >= 0);
>
> That is, the case where mask_out_inactive is true under length-based loop control.
>
> So we can handle the following 2 cases:
>
> 1. Integer division/remainder:
>
>    #define TEST_TYPE(TYPE) \
>    __attribute__((noipa)) \
>    void vrem_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n) \
>    { \
>      for (int i = 0; i < n; i++) \
>        dst[i] = a[i] % b[i]; \
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(int8_t)
>    TEST_ALL()
>
> With this patch:
>   
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_MOD ({ -1, ... }, vect__4.8_48, vect__6.11_52,
>                                  vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> 2. Floating-point arithmetic **WITHOUT** -ffast-math
>   
>    #define TEST_TYPE(TYPE) \
>    __attribute__((noipa)) \
>    void vadd_##TYPE (TYPE *dst, TYPE *a, TYPE *b, int n) \
>    { \
>      for (int i = 0; i < n; i++) \
>        dst[i] = a[i] + b[i]; \
>    }
>    #define TEST_ALL() \
>    TEST_TYPE(float)
>    TEST_ALL()
>
> With this patch:
>    
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [4, 4]);
>   ivtmp_45 = _61 * 4;
>   vect__4.8_48 = .LEN_MASK_LOAD (vectp_a.6_46, 32B, _61, 0, { -1, ... });
>   vect__6.11_52 = .LEN_MASK_LOAD (vectp_b.9_50, 32B, _61, 0, { -1, ... });
>   vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52,
>                                  vect__4.8_48, _61, 0);
>   .LEN_MASK_STORE (vectp_dst.13_55, 32B, _61, 0, { -1, ... }, vect__8.12_53);
>
> With this patch, we can make sure that such operations won't trap on the
> inactive ("mask_out_inactive") elements.
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (FOR_EACH_CODE_MAPPING): Adapt for COND_LEN_*
>         support.
>         (CASE): Ditto.
>         (get_conditional_len_internal_fn): New function.
>         * internal-fn.h (get_conditional_len_internal_fn): Ditto.
>         * tree-vect-stmts.cc (vectorizable_operation): Adapt for COND_LEN_*
>         support.
>
> ---
>  gcc/internal-fn.cc     | 73 +++++++++++++++++++++++++++++++-----------
>  gcc/internal-fn.h      |  1 +
>  gcc/tree-vect-stmts.cc | 48 ++++++++++++++++++++-------
>  3 files changed, 93 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index f9aaf66cf2a..b288ac6fe6b 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4276,23 +4276,24 @@ static void (*const internal_fn_expanders[]) (internal_fn, gcall *) = {
>    0
>  };
>  
> -/* Invoke T(CODE, IFN) for each conditional function IFN that maps to a
> -   tree code CODE.  */
> +/* Invoke T(CODE, SUFFIX) for each conditional function IFN_COND_##SUFFIX
> +   that maps to a tree code CODE.  There is also an IFN_COND_LEN_##SUFFIX
> +   for each such IFN_COND_##SUFFIX.  */
>  #define FOR_EACH_CODE_MAPPING(T) \
> -  T (PLUS_EXPR, IFN_COND_ADD) \
> -  T (MINUS_EXPR, IFN_COND_SUB) \
> -  T (MULT_EXPR, IFN_COND_MUL) \
> -  T (TRUNC_DIV_EXPR, IFN_COND_DIV) \
> -  T (TRUNC_MOD_EXPR, IFN_COND_MOD) \
> -  T (RDIV_EXPR, IFN_COND_RDIV) \
> -  T (MIN_EXPR, IFN_COND_MIN) \
> -  T (MAX_EXPR, IFN_COND_MAX) \
> -  T (BIT_AND_EXPR, IFN_COND_AND) \
> -  T (BIT_IOR_EXPR, IFN_COND_IOR) \
> -  T (BIT_XOR_EXPR, IFN_COND_XOR) \
> -  T (LSHIFT_EXPR, IFN_COND_SHL) \
> -  T (RSHIFT_EXPR, IFN_COND_SHR) \
> -  T (NEGATE_EXPR, IFN_COND_NEG)
> +  T (PLUS_EXPR, ADD) \
> +  T (MINUS_EXPR, SUB) \
> +  T (MULT_EXPR, MUL) \
> +  T (TRUNC_DIV_EXPR, DIV) \
> +  T (TRUNC_MOD_EXPR, MOD) \
> +  T (RDIV_EXPR, RDIV) \
> +  T (MIN_EXPR, MIN) \
> +  T (MAX_EXPR, MAX) \
> +  T (BIT_AND_EXPR, AND) \
> +  T (BIT_IOR_EXPR, IOR) \
> +  T (BIT_XOR_EXPR, XOR) \
> +  T (LSHIFT_EXPR, SHL) \
> +  T (RSHIFT_EXPR, SHR) \
> +  T (NEGATE_EXPR, NEG)
>  
>  /* Return a function that only performs CODE when a certain condition is met
>     and that uses a given fallback value otherwise.  For example, if CODE is
> @@ -4313,7 +4314,7 @@ get_conditional_internal_fn (tree_code code)
>  {
>    switch (code)
>      {
> -#define CASE(CODE, IFN) case CODE: return IFN;
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_##IFN;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4329,7 +4330,7 @@ conditional_internal_fn_code (internal_fn ifn)
>  {
>    switch (ifn)
>      {
> -#define CASE(CODE, IFN) case IFN: return CODE;
> +#define CASE(CODE, IFN) case IFN_COND_##IFN: return CODE;
>        FOR_EACH_CODE_MAPPING(CASE)
>  #undef CASE
>      default:
> @@ -4337,6 +4338,42 @@ conditional_internal_fn_code (internal_fn ifn)
>      }
>  }
>  
> +/* Like get_conditional_internal_fn, but return a function that
> +   additionally restricts the operation to the leading elements
> +   of a vector.  The number of elements to process is given by
> +   a length and bias pair.  The function only performs the CODE
> +   when a certain condition is met as well as the element is located
> +   within LEN + BIAS (i < LEN + BIAS) and that uses a given fallback value
> +   otherwise.
 
How about:
 
/* Like get_conditional_internal_fn, but return a function that
   additionally restricts the operation to the leading elements
   of a vector.  The number of elements to process is given by a length
   and bias pair, as for IFN_LEN_LOAD.  The values of the remaining
   elements are taken from the fallback ("else") argument.
 
I'd like to keep the reference to IFN_LEN_LOAD.  Without it, people
might wonder why there are two length parameters, rather than one.
 
> +
> +   For example, if CODE is [PLUS, MINUS, ... etc]:
 
   For example, if CODE is a binary operation associated with FN:
 
> +
> +     LHS = FN (COND, A, B, ELSE, LEN, BIAS)
> +
> +   is equivalent to the C expression:
 
s/expression/code/ (sorry for not noticing last time).
 
> +
> +     for (int i = 0; i < NUNITS; i++)
> +       {
> +         if (COND[i] && i < (LEN + BIAS))
 
IMO it's more natural to put the condition the other way:
 
  if (i < LEN + BIAS && COND[i])
 
OK with those changes, thanks.
 
Richard
 
> +           LHS[i] = A[i] CODE B[i];
> +         else
> +           LHS[i] = ELSE[i];
> +       }
> +*/
> +
> +internal_fn
> +get_conditional_len_internal_fn (tree_code code)
> +{
> +  switch (code)
> +    {
> +#define CASE(CODE, IFN) case CODE: return IFN_COND_LEN_##IFN;
> +      FOR_EACH_CODE_MAPPING(CASE)
> +#undef CASE
> +    default:
> +      return IFN_LAST;
> +    }
> +}
> +
>  /* Invoke T(IFN) for each internal function IFN that also has an
>     IFN_COND_* form.  */
>  #define FOR_EACH_COND_FN_PAIR(T) \
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 4234bbfed87..dd1bab0bddf 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -224,6 +224,7 @@ extern bool set_edom_supported_p (void);
>  
>  extern internal_fn get_conditional_internal_fn (tree_code);
>  extern internal_fn get_conditional_internal_fn (internal_fn);
> +extern internal_fn get_conditional_len_internal_fn (tree_code);
>  extern tree_code conditional_internal_fn_code (internal_fn);
>  extern internal_fn get_unconditional_internal_fn (internal_fn);
>  extern bool can_interpret_as_conditional_op_p (gimple *, tree *,
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 10e71178ce7..dd24f017235 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -6711,7 +6711,9 @@ vectorizable_operation (vec_info *vinfo,
>  
>    int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
>    vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
> +  vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
>    internal_fn cond_fn = get_conditional_internal_fn (code);
> +  internal_fn cond_len_fn = get_conditional_len_internal_fn (code);
>  
>    /* If operating on inactive elements could generate spurious traps,
>       we need to restrict the operation to active lanes.  Note that this
> @@ -6730,9 +6732,17 @@ vectorizable_operation (vec_info *vinfo,
>            && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
>            && mask_out_inactive)
>          {
> -          if (cond_fn == IFN_LAST
> -              || !direct_internal_fn_supported_p (cond_fn, vectype,
> -                                                  OPTIMIZE_FOR_SPEED))
> +          if (cond_fn != IFN_LAST
> +              && direct_internal_fn_supported_p (cond_fn, vectype,
> +                                                 OPTIMIZE_FOR_SPEED))
> +            vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> +                                   vectype, NULL);
> +          else if (cond_len_fn != IFN_LAST
> +                   && direct_internal_fn_supported_p (cond_len_fn, vectype,
> +                                                      OPTIMIZE_FOR_SPEED))
> +            vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num, vectype,
> +                                  1);
> +          else
>              {
>                if (dump_enabled_p ())
>                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -6740,9 +6750,6 @@ vectorizable_operation (vec_info *vinfo,
>                                   " conditional operation is available.\n");
>                LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
>              }
> -          else
> -            vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
> -                                   vectype, NULL);
>          }
>  
>        /* Put types on constant and invariant SLP children.  */
> @@ -6805,6 +6812,7 @@ vectorizable_operation (vec_info *vinfo,
>                       "transform binary/unary operation.\n");
>  
>    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
>  
>    /* POINTER_DIFF_EXPR has pointer arguments which are vectorized as
>       vectors with unsigned elements, but the result is signed.  So, we
> @@ -6971,11 +6979,16 @@ vectorizable_operation (vec_info *vinfo,
>            gimple_assign_set_lhs (new_stmt, new_temp);
>            vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
>          }
> -      else if (masked_loop_p && mask_out_inactive)
> +      else if ((masked_loop_p || len_loop_p) && mask_out_inactive)
>          {
> -          tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> -                                          vec_num * ncopies, vectype, i);
> -          auto_vec<tree> vops (5);
> +          tree mask;
> +          if (masked_loop_p)
> +            mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
> +                                       vec_num * ncopies, vectype, i);
> +          else
> +            /* Dummy mask.  */
> +            mask = build_minus_one_cst (truth_type_for (vectype));
> +          auto_vec<tree> vops (6);
>            vops.quick_push (mask);
>            vops.quick_push (vop0);
>            if (vop1)
> @@ -6995,7 +7008,20 @@ vectorizable_operation (vec_info *vinfo,
>                   (cond_fn, vectype, vops.length () - 1, &vops[1]);
>                vops.quick_push (else_value);
>              }
> -          gcall *call = gimple_build_call_internal_vec (cond_fn, vops);
> +          if (len_loop_p)
> +            {
> +              tree len = vect_get_loop_len (loop_vinfo, gsi, lens,
> +                                            vec_num * ncopies, vectype, i, 1);
> +              signed char biasval
> +                = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +              tree bias = build_int_cst (intQI_type_node, biasval);
> +              vops.quick_push (len);
> +              vops.quick_push (bias);
> +            }
> +          gcall *call
> +            = gimple_build_call_internal_vec (masked_loop_p ? cond_fn
> +                                                            : cond_len_fn,
> +                                              vops);
>            new_temp = make_ssa_name (vec_dest, call);
>            gimple_call_set_lhs (call, new_temp);
>            gimple_call_set_nothrow (call, true);
 
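For reference, on the length-controlled path the vops vector built above ends
up as { mask, operands..., else, len, bias }, so the emitted call has the
shape seen in the dumps quoted at the top of the patch, e.g.:

  vect__8.12_53 = .COND_LEN_ADD ({ -1, ... }, vect__4.8_48, vect__6.11_52,
                                 vect__4.8_48, _61, 0);

where { -1, ... } is the all-true dummy mask, _61 is the .SELECT_VL length
and 0 is the bias.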