On Wed, 19 Apr 2023, Jakub Jelinek wrote: > Hi! > > For __builtin_popcountll tree-vect-patterns.cc has > vect_recog_popcount_pattern, which improves the vectorized code. > Without that the vectorization is always multi-type vectorization > in the loop (at least int and long long types) where we emit two > .POPCOUNT calls with long long arguments and int return value and then > widen to long long, so effectively after vectorization do the > V?DImode -> V?DImode popcount twice, then pack the result into V?SImode > and immediately unpack. > > The following patch extends that handling to __builtin_{clz,ctz,ffs}ll > builtins as well (as long as there is an optab for them; more to come > laster). > > Bootstrapped/regtested on x86_64-linux and i686-linux, plus tested on > the testcase in crosses to powerpc64le-linux and s390x-linux. Ok > for trunk?
OK. Richard. > x86 can do __builtin_popcountll with -mavx512vpopcntdq, __builtin_clzll > with -mavx512cd, ppc can do __builtin_popcountll and __builtin_clzll > with -mpower8-vector and __builtin_ctzll with -mpower9-vector, s390 > can do __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX). > > 2023-04-19 Jakub Jelinek <ja...@redhat.com> > > PR tree-optimization/109011 > * tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ... > (vect_recog_popcount_clz_ctz_ffs_pattern): ... this. Handle also > CLZ, CTZ and FFS. Remove vargs variable, use > gimple_build_call_internal rather than gimple_build_call_internal_vec. > (vect_vect_recog_func_ptrs): Adjust popcount entry. > > * gcc.dg/vect/pr109011-1.c: New test. > > --- gcc/tree-vect-patterns.cc.jj 2023-03-01 09:51:27.995362601 +0100 > +++ gcc/tree-vect-patterns.cc 2023-04-18 17:16:42.733935262 +0200 > @@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info > "vect_recog_widen_minus_pattern"); > } > > -/* Function vect_recog_popcount_pattern > +/* Function vect_recog_popcount_clz_ctz_ffs_pattern > > Try to find the following pattern: > > @@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info > * Return value: A new stmt that will be used to replace the sequence of > stmts that constitute the pattern. In this case it will be: > B = .POPCOUNT (A); > + > + Similarly for clz, ctz and ffs. > */ > > static gimple * > -vect_recog_popcount_pattern (vec_info *vinfo, > - stmt_vec_info stmt_vinfo, tree *type_out) > +vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo, > + stmt_vec_info stmt_vinfo, > + tree *type_out) > { > gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt); > - gimple *popcount_stmt, *pattern_stmt; > + gimple *call_stmt, *pattern_stmt; > tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var; > - auto_vec<tree> vargs; > + internal_fn ifn = IFN_LAST; > + int addend = 0; > > /* Find B = (TYPE1) temp_out. */ > if (!last_stmt) > @@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *v > if (TREE_CODE (rhs_oprnd) != SSA_NAME > || !has_single_use (rhs_oprnd)) > return NULL; > - popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd); > + call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd); > > /* Find temp_out = __builtin_popcount{,l,ll} (temp_in); */ > - if (!is_gimple_call (popcount_stmt)) > + if (!is_gimple_call (call_stmt)) > return NULL; > - switch (gimple_call_combined_fn (popcount_stmt)) > + switch (gimple_call_combined_fn (call_stmt)) > { > + int val; > CASE_CFN_POPCOUNT: > + ifn = IFN_POPCOUNT; > + break; > + CASE_CFN_CLZ: > + ifn = IFN_CLZ; > + /* Punt if call result is unsigned and defined value at zero > + is negative, as the negative value doesn't extend correctly. */ > + if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd)) > + && gimple_call_internal_p (call_stmt) > + && CLZ_DEFINED_VALUE_AT_ZERO > + (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2 > + && val < 0) > + return NULL; > + break; > + CASE_CFN_CTZ: > + ifn = IFN_CTZ; > + /* Punt if call result is unsigned and defined value at zero > + is negative, as the negative value doesn't extend correctly. */ > + if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd)) > + && gimple_call_internal_p (call_stmt) > + && CTZ_DEFINED_VALUE_AT_ZERO > + (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2 > + && val < 0) > + return NULL; > + break; > + CASE_CFN_FFS: > + ifn = IFN_FFS; > break; > default: > return NULL; > } > > - if (gimple_call_num_args (popcount_stmt) != 1) > + if (gimple_call_num_args (call_stmt) != 1) > return NULL; > > - rhs_oprnd = gimple_call_arg (popcount_stmt, 0); > + rhs_oprnd = gimple_call_arg (call_stmt, 0); > vect_unpromoted_value unprom_diff; > - rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd, > - &unprom_diff); > + rhs_origin > + = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff); > > if (!rhs_origin) > return NULL; > > - /* Input and output of .POPCOUNT should be same-precision integer. > - Also A should be unsigned or same precision as temp_in, > - otherwise there would be sign_extend from A to temp_in. */ > - if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type) > - || (!TYPE_UNSIGNED (unprom_diff.type) > - && (TYPE_PRECISION (unprom_diff.type) > - != TYPE_PRECISION (TREE_TYPE (rhs_oprnd))))) > + /* Input and output of .POPCOUNT should be same-precision integer. */ > + if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)) > return NULL; > - vargs.safe_push (unprom_diff.op); > > - vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt); > + /* Also A should be unsigned or same precision as temp_in, otherwise > + different builtins/internal functions have different behaviors. */ > + if (TYPE_PRECISION (unprom_diff.type) > + != TYPE_PRECISION (TREE_TYPE (rhs_oprnd))) > + switch (ifn) > + { > + case IFN_POPCOUNT: > + /* For popcount require zero extension, which doesn't add any > + further bits to the count. */ > + if (!TYPE_UNSIGNED (unprom_diff.type)) > + return NULL; > + break; > + case IFN_CLZ: > + /* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok > + if it is undefined at zero or if it matches also for the > + defined value there. */ > + if (!TYPE_UNSIGNED (unprom_diff.type)) > + return NULL; > + if (!type_has_mode_precision_p (lhs_type) > + || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd))) > + return NULL; > + addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd)) > + - TYPE_PRECISION (lhs_type)); > + if (gimple_call_internal_p (call_stmt)) > + { > + int val1, val2; > + int d1 > + = CLZ_DEFINED_VALUE_AT_ZERO > + (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1); > + int d2 > + = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type), > + val2); > + if (d1 != 2) > + break; > + if (d2 != 2 || val1 != val2 + addend) > + return NULL; > + } > + break; > + case IFN_CTZ: > + /* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok > + if it is undefined at zero or if it matches also for the > + defined value there. */ > + if (gimple_call_internal_p (call_stmt)) > + { > + int val1, val2; > + int d1 > + = CTZ_DEFINED_VALUE_AT_ZERO > + (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1); > + int d2 > + = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type), > + val2); > + if (d1 != 2) > + break; > + if (d2 != 2 || val1 != val2) > + return NULL; > + } > + break; > + case IFN_FFS: > + /* ffsll (x) == ffs (x) for unsigned or signed x. */ > + break; > + default: > + gcc_unreachable (); > + } > + > + vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern", > + call_stmt); > vec_type = get_vectype_for_scalar_type (vinfo, lhs_type); > - /* Do it only if the backend has popcount<vector_mode>2 pattern. */ > + /* Do it only if the backend has popcount<vector_mode>2 etc. pattern. */ > if (!vec_type > - || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type, > + || !direct_internal_fn_supported_p (ifn, vec_type, > OPTIMIZE_FOR_SPEED)) > return NULL; > > /* Create B = .POPCOUNT (A). */ > new_var = vect_recog_temp_ssa_var (lhs_type, NULL); > - pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs); > + pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op); > gimple_call_set_lhs (pattern_stmt, new_var); > gimple_set_location (pattern_stmt, gimple_location (last_stmt)); > *type_out = vec_type; > @@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *v > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > "created pattern stmt: %G", pattern_stmt); > + > + if (addend) > + { > + append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type); > + tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL); > + pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var, > + build_int_cst (lhs_type, addend)); > + } > return pattern_stmt; > } > > @@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_f > { vect_recog_sad_pattern, "sad" }, > { vect_recog_widen_sum_pattern, "widen_sum" }, > { vect_recog_pow_pattern, "pow" }, > - { vect_recog_popcount_pattern, "popcount" }, > + { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" }, > { vect_recog_widen_shift_pattern, "widen_shift" }, > { vect_recog_rotate_pattern, "rotate" }, > { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" }, > --- gcc/testsuite/gcc.dg/vect/pr109011-1.c.jj 2023-04-18 14:40:47.117397908 > +0200 > +++ gcc/testsuite/gcc.dg/vect/pr109011-1.c 2023-04-18 14:40:05.124004362 > +0200 > @@ -0,0 +1,48 @@ > +/* PR tree-optimization/109011 */ > +/* { dg-do compile } */ > +/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 > -fdump-tree-optimized" } */ > +/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } > && avx512cd } } } */ > +/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* > x86_64-*-* } && avx512vpopcntdq } } } */ > +/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } > } */ > +/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } > } */ > +/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */ > + > +void > +foo (long long *p, long long *q) > +{ > +#pragma omp simd > + for (int i = 0; i < 2048; ++i) > + p[i] = __builtin_popcountll (q[i]); > +} > + > +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { > target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */ > +/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { > target { powerpc_p8vector_ok || s390_vx } } } } */ > + > +void > +bar (long long *p, long long *q) > +{ > +#pragma omp simd > + for (int i = 0; i < 2048; ++i) > + p[i] = __builtin_clzll (q[i]); > +} > + > +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target > { { i?86-*-* x86_64-*-* } && avx512cd } } } } */ > +/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target > { powerpc_p8vector_ok || s390_vx } } } } */ > + > +void > +baz (long long *p, long long *q) > +{ > +#pragma omp simd > + for (int i = 0; i < 2048; ++i) > + p[i] = __builtin_ctzll (q[i]); > +} > + > +/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target > { powerpc_p9vector_ok || s390_vx } } } } */ > + > +void > +qux (long long *p, long long *q) > +{ > +#pragma omp simd > + for (int i = 0; i < 2048; ++i) > + p[i] = __builtin_ffsll (q[i]); > +} > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; HRB 36809 (AG Nuernberg)