On Wed, 19 Apr 2023, Jakub Jelinek wrote:

> Hi!
> 
> For __builtin_popcountll tree-vect-patterns.cc has
> vect_recog_popcount_pattern, which improves the vectorized code.
> Without that the vectorization is always multi-type vectorization
> in the loop (at least int and long long types) where we emit two
> .POPCOUNT calls with long long arguments and int return value and then
> widen to long long, so effectively after vectorization do the
> V?DImode -> V?DImode popcount twice, then pack the result into V?SImode
> and immediately unpack.
> 
> The following patch extends that handling to __builtin_{clz,ctz,ffs}ll
> builtins as well (as long as there is an optab for them; more to come
> laster).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on
> the testcase in crosses to powerpc64le-linux and s390x-linux.  Ok
> for trunk?

OK.

Richard.

> x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll
> with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll
> with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390
> can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).
> 
> 2023-04-19  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR tree-optimization/109011
>       * tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
>       (vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
>       CLZ, CTZ and FFS.  Remove vargs variable, use
>       gimple_build_call_internal rather than gimple_build_call_internal_vec.
>       (vect_vect_recog_func_ptrs): Adjust popcount entry.
> 
>       * gcc.dg/vect/pr109011-1.c: New test.
> 
> --- gcc/tree-vect-patterns.cc.jj      2023-03-01 09:51:27.995362601 +0100
> +++ gcc/tree-vect-patterns.cc 2023-04-18 17:16:42.733935262 +0200
> @@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info
>                                     "vect_recog_widen_minus_pattern");
>  }
>  
> -/* Function vect_recog_popcount_pattern
> +/* Function vect_recog_popcount_clz_ctz_ffs_pattern
>  
>     Try to find the following pattern:
>  
> @@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info
>     * Return value: A new stmt that will be used to replace the sequence of
>     stmts that constitute the pattern. In this case it will be:
>     B = .POPCOUNT (A);
> +
> +   Similarly for clz, ctz and ffs.
>  */
>  
>  static gimple *
> -vect_recog_popcount_pattern (vec_info *vinfo,
> -                          stmt_vec_info stmt_vinfo, tree *type_out)
> +vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
> +                                      stmt_vec_info stmt_vinfo,
> +                                      tree *type_out)
>  {
>    gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
> -  gimple *popcount_stmt, *pattern_stmt;
> +  gimple *call_stmt, *pattern_stmt;
>    tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
> -  auto_vec<tree> vargs;
> +  internal_fn ifn = IFN_LAST;
> +  int addend = 0;
>  
>    /* Find B = (TYPE1) temp_out. */
>    if (!last_stmt)
> @@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *v
>    if (TREE_CODE (rhs_oprnd) != SSA_NAME
>        || !has_single_use (rhs_oprnd))
>      return NULL;
> -  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
> +  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
>  
>    /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
> -  if (!is_gimple_call (popcount_stmt))
> +  if (!is_gimple_call (call_stmt))
>      return NULL;
> -  switch (gimple_call_combined_fn (popcount_stmt))
> +  switch (gimple_call_combined_fn (call_stmt))
>      {
> +      int val;
>      CASE_CFN_POPCOUNT:
> +      ifn = IFN_POPCOUNT;
> +      break;
> +    CASE_CFN_CLZ:
> +      ifn = IFN_CLZ;
> +      /* Punt if call result is unsigned and defined value at zero
> +      is negative, as the negative value doesn't extend correctly.  */
> +      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
> +       && gimple_call_internal_p (call_stmt)
> +       && CLZ_DEFINED_VALUE_AT_ZERO
> +            (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
> +       && val < 0)
> +     return NULL;
> +      break;
> +    CASE_CFN_CTZ:
> +      ifn = IFN_CTZ;
> +      /* Punt if call result is unsigned and defined value at zero
> +      is negative, as the negative value doesn't extend correctly.  */
> +      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
> +       && gimple_call_internal_p (call_stmt)
> +       && CTZ_DEFINED_VALUE_AT_ZERO
> +            (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
> +       && val < 0)
> +     return NULL;
> +      break;
> +    CASE_CFN_FFS:
> +      ifn = IFN_FFS;
>        break;
>      default:
>        return NULL;
>      }
>  
> -  if (gimple_call_num_args (popcount_stmt) != 1)
> +  if (gimple_call_num_args (call_stmt) != 1)
>      return NULL;
>  
> -  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
> +  rhs_oprnd = gimple_call_arg (call_stmt, 0);
>    vect_unpromoted_value unprom_diff;
> -  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
> -                                                 &unprom_diff);
> +  rhs_origin
> +    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
>  
>    if (!rhs_origin)
>      return NULL;
>  
> -  /* Input and output of .POPCOUNT should be same-precision integer.
> -     Also A should be unsigned or same precision as temp_in,
> -     otherwise there would be sign_extend from A to temp_in.  */
> -  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
> -      || (!TYPE_UNSIGNED (unprom_diff.type)
> -       && (TYPE_PRECISION (unprom_diff.type)
> -           != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
> +  /* Input and output of .POPCOUNT should be same-precision integer.  */
> +  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
>      return NULL;
> -  vargs.safe_push (unprom_diff.op);
>  
> -  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
> +  /* Also A should be unsigned or same precision as temp_in, otherwise
> +     different builtins/internal functions have different behaviors.  */
> +  if (TYPE_PRECISION (unprom_diff.type)
> +      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
> +    switch (ifn)
> +      {
> +      case IFN_POPCOUNT:
> +     /* For popcount require zero extension, which doesn't add any
> +        further bits to the count.  */
> +     if (!TYPE_UNSIGNED (unprom_diff.type))
> +       return NULL;
> +     break;
> +      case IFN_CLZ:
> +     /* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
> +        if it is undefined at zero or if it matches also for the
> +        defined value there.  */
> +     if (!TYPE_UNSIGNED (unprom_diff.type))
> +       return NULL;
> +     if (!type_has_mode_precision_p (lhs_type)
> +         || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
> +       return NULL;
> +     addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
> +               - TYPE_PRECISION (lhs_type));
> +     if (gimple_call_internal_p (call_stmt))
> +       {
> +         int val1, val2;
> +         int d1
> +           = CLZ_DEFINED_VALUE_AT_ZERO
> +               (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
> +         int d2
> +           = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
> +                                        val2);
> +         if (d1 != 2)
> +           break;
> +         if (d2 != 2 || val1 != val2 + addend)
> +           return NULL;
> +       }
> +     break;
> +      case IFN_CTZ:
> +     /* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
> +        if it is undefined at zero or if it matches also for the
> +        defined value there.  */
> +     if (gimple_call_internal_p (call_stmt))
> +       {
> +         int val1, val2;
> +         int d1
> +           = CTZ_DEFINED_VALUE_AT_ZERO
> +               (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
> +         int d2
> +           = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
> +                                        val2);
> +         if (d1 != 2)
> +           break;
> +         if (d2 != 2 || val1 != val2)
> +           return NULL;
> +       }
> +     break;
> +      case IFN_FFS:
> +     /* ffsll (x) == ffs (x) for unsigned or signed x.  */
> +     break;
> +      default:
> +     gcc_unreachable ();
> +      }
> +
> +  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
> +                      call_stmt);
>    vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
> -  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
> +  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
>    if (!vec_type
> -      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
> +      || !direct_internal_fn_supported_p (ifn, vec_type,
>                                         OPTIMIZE_FOR_SPEED))
>      return NULL;
>  
>    /* Create B = .POPCOUNT (A).  */
>    new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> -  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
> +  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
>    gimple_call_set_lhs (pattern_stmt, new_var);
>    gimple_set_location (pattern_stmt, gimple_location (last_stmt));
>    *type_out = vec_type;
> @@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *v
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                    "created pattern stmt: %G", pattern_stmt);
> +
> +  if (addend)
> +    {
> +      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
> +      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
> +      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
> +                                       build_int_cst (lhs_type, addend));
> +    }
>    return pattern_stmt;
>  }
>  
> @@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_f
>    { vect_recog_sad_pattern, "sad" },
>    { vect_recog_widen_sum_pattern, "widen_sum" },
>    { vect_recog_pow_pattern, "pow" },
> -  { vect_recog_popcount_pattern, "popcount" },
> +  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
>    { vect_recog_widen_shift_pattern, "widen_shift" },
>    { vect_recog_rotate_pattern, "rotate" },
>    { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
> --- gcc/testsuite/gcc.dg/vect/pr109011-1.c.jj 2023-04-18 14:40:47.117397908 
> +0200
> +++ gcc/testsuite/gcc.dg/vect/pr109011-1.c    2023-04-18 14:40:05.124004362 
> +0200
> @@ -0,0 +1,48 @@
> +/* PR tree-optimization/109011 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 
> -fdump-tree-optimized" } */
> +/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } 
> && avx512cd } } } */
> +/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* 
> x86_64-*-* } && avx512vpopcntdq } } } */
> +/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } 
> } */
> +/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } 
> } */
> +/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
> +
> +void
> +foo (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_popcountll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { 
> target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
> +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { 
> target { powerpc_p8vector_ok || s390_vx } } } } */
> +
> +void
> +bar (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_clzll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target 
> { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
> +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target 
> { powerpc_p8vector_ok || s390_vx } } } } */
> +
> +void
> +baz (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_ctzll (q[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target 
> { powerpc_p9vector_ok || s390_vx } } } } */
> +
> +void
> +qux (long long *p, long long *q)
> +{
> +#pragma omp simd
> +  for (int i = 0; i < 2048; ++i)
> +    p[i] = __builtin_ffsll (q[i]);
> +}
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)

Reply via email to