> > index 70a27555d900caf71932a2c794d9a77d06048b6a..b677a5308fb3498314cbbfc1865eae6bf1c815b9 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6087,6 +6087,25 @@ vectors with N signed/unsigned elements of size S@. Find the absolute
> > difference between operands 1 and 2 and widen the resulting elements.
> > Put the N/2 results of size 2*S in the output vector (operand 0).
> >
> > +@cindex @code{vec_trunc_add_high@var{m}} instruction pattern
> > +@item @samp{vec_trunc_add_high@var{m}}
> > +Signed or unsigned addition of two input vectors, then extracts the
> > +most significant half of each result element and narrows it back to the
> > +original element width.
>
> It narrows it to a vector with elements of half width?
>
> > +
> > +Concretely, it computes:
> > +@code{(bits(a)/2)((a + b) >> bits(a)/2)}
> > +
> > +where @code{bits(a)} is the width in bits of each input element.
> > +
> > +Its operands (@code{1} and @code{2}) are vectors containing the same number
> > +of signed or unsigned integral elements (@code{N}) of size @code{S}. The
> > +result (operand @code{0}) is a vector of length @code{N}, with elements of
> > +an integral type whose size is half that of @code{S}.
>
> We are usually documenting optabs in terms of modes.  So
> "Operand 1 and 2 are of integer vector mode @var{m}" (? or is the result
> of mode @var{m}?) ", operand 0 is of an integer vector mode with the
> same number of elements but elements of half of the width of those of
> mode @var{m}."?
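To make the @code{(bits(a)/2)((a + b) >> bits(a)/2)} semantics concrete:
per lane, the addition happens at the full element width and only the most
significant half of the sum is kept, narrowed to half-width elements.  A
minimal scalar sketch of one 32-bit lane, assuming the addition wraps at
the element width as AArch64's ADDHN does (the helper name is made up for
illustration):

#include <stdint.h>

/* One lane of the operation for 32-bit elements: add with wraparound at
   the element width, keep the most significant half of the sum, and
   narrow it to a 16-bit result.  */
int16_t
trunc_add_high_lane_s32 (int32_t a, int32_t b)
{
  uint32_t sum = (uint32_t) a + (uint32_t) b;  /* wrapping 32-bit add */
  return (int16_t) (sum >> 16);                /* high half, narrowed */
}
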
Given a sequence such as

int foo ()
{
#pragma GCC unroll 4
  for (int i = 0; i < N; i++)
    if (a[i] == 124)
      return 1;

  return 0;
}

where a[i] is long long, we will unroll the loop and use an OR reduction
for the early break on Adv. SIMD.  The reduction is then followed by a
compression sequence that narrows the 128-bit vector to 64 bits for use by
the branch.

However, if we have support for add halving and narrowing, then instead of
an OR we can use an ADDHN, which does the combining and the narrowing in
one instruction.

Note that for now I only convert the last OR; if we have more than one
level of unrolling we could technically chain them.  I will revisit this
in another upcoming early break series, but an unroll of 2 is fairly
common.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64) with no
issues, and with about a 10% improvement in this sequence for Adv. SIMD.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* internal-fn.def (VEC_TRUNC_ADD_HIGH): New.
	* doc/generic.texi: Document it.
	* optabs.def (vec_trunc_add_high): New.
	* doc/md.texi: Document it.
	* tree-vect-stmts.cc (vectorizable_early_exit): Use addhn
	if supported.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vect-early-break-addhn_1.c: New test.
	* gcc.target/aarch64/vect-early-break-addhn_2.c: New test.
	* gcc.target/aarch64/vect-early-break-addhn_3.c: New test.
	* gcc.target/aarch64/vect-early-break-addhn_4.c: New test.

-- inline copy of patch --

diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index 55083b278da4dacf4e9114ca444bf11b2ae0caf0..0ea4860703e616315acd6d8d4f51c76598f6e3ef 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -1833,6 +1833,7 @@ a value from @code{enum annot_expr_kind}, the third is an @code{INTEGER_CST}.
 @tindex IFN_VEC_WIDEN_MINUS_LO
 @tindex IFN_VEC_WIDEN_MINUS_EVEN
 @tindex IFN_VEC_WIDEN_MINUS_ODD
+@tindex IFN_VEC_TRUNC_ADD_HIGH
 @tindex VEC_UNPACK_HI_EXPR
 @tindex VEC_UNPACK_LO_EXPR
 @tindex VEC_UNPACK_FLOAT_HI_EXPR
@@ -1955,6 +1956,24 @@ vector of @code{N/2} subtractions.  In the case of vector
 are subtracted from the odd @code{N/2} of the first to produce the
 vector of @code{N/2} subtractions.
 
+@item IFN_VEC_TRUNC_ADD_HIGH
+This internal function performs an addition of two input vectors,
+then extracts the most significant half of each result element and
+narrows it to elements of half the original width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element.
+
+Its operands are vectors containing the same number of elements (@code{N})
+of the same integral type.  The result is a vector of length @code{N}, with
+elements of an integral type whose size is half that of the input element
+type.
+
+This operation is currently only used for early break result compression
+when the result of a vector boolean can be represented as 0 or -1.
+
 @item VEC_UNPACK_HI_EXPR
 @itemx VEC_UNPACK_LO_EXPR
 These nodes represent unpacking of the high and low parts of the input vector,
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 973c0dd302964966a91fa8dbab85930d6dbeec9e..bac22b6338042a5a546db7854988eab628f08eea 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6087,6 +6087,25 @@ vectors with N signed/unsigned elements of size S@. Find the absolute
 difference between operands 1 and 2 and widen the resulting elements.
 Put the N/2 results of size 2*S in the output vector (operand 0).
 
+@cindex @code{vec_trunc_add_high@var{m}} instruction pattern
+@item @samp{vec_trunc_add_high@var{m}}
+Perform signed or unsigned addition of two input integer vectors of mode
+@var{m}, then extract the most significant half of each result element and
+narrow it to elements of half the original width.
+
+Concretely, it computes:
+@code{(bits(a)/2)((a + b) >> bits(a)/2)}
+
+where @code{bits(a)} is the width in bits of each input element.
+
+Operands 1 and 2 are of integer vector mode @var{m} containing the same
+number of signed or unsigned integral elements.  The result (operand
+@code{0}) is of an integer vector mode with the same number of elements
+but with elements of half the width of those of mode @var{m}.
+
+This operation is currently only used for early break result compression
+when the result of a vector boolean can be represented as 0 or -1.
+
 @cindex @code{vec_addsub@var{m}3} instruction pattern
 @item @samp{vec_addsub@var{m}3}
 Alternating subtract, add with even lanes doing subtract and odd
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index d2480a1bf7927476215bc7bb99c0b74197d2b7e9..8434a805e289e109c49c53ef887a519112af1f33 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -422,6 +422,8 @@ DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
 DEF_INTERNAL_OPTAB_FN (VEC_ADDSUB, ECF_CONST, vec_addsub, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_TRUNC_ADD_HIGH, ECF_CONST | ECF_NOTHROW,
+		       vec_trunc_add_high, binary)
 
 DEF_INTERNAL_WIDENING_OPTAB_FN (VEC_WIDEN_PLUS,
 				ECF_CONST | ECF_NOTHROW,
 				first,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b59d02bce14cd8cd4392ac568d2547601aac4481..790e43f08f476c8025dc2797f9ecaffe5b66acc5 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -493,6 +493,7 @@ OPTAB_D (vec_widen_uabd_hi_optab, "vec_widen_uabd_hi_$a")
 OPTAB_D (vec_widen_uabd_lo_optab, "vec_widen_uabd_lo_$a")
 OPTAB_D (vec_widen_uabd_odd_optab, "vec_widen_uabd_odd_$a")
 OPTAB_D (vec_widen_uabd_even_optab, "vec_widen_uabd_even_$a")
+OPTAB_D (vec_trunc_add_high_optab, "vec_trunc_add_high$a")
 OPTAB_D (vec_addsub_optab, "vec_addsub$a3")
 OPTAB_D (vec_fmaddsub_optab, "vec_fmaddsub$a4")
 OPTAB_D (vec_fmsubadd_optab, "vec_fmsubadd$a4")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..b22e7d9c49d3588fa7e1e2c8eac43074109ccaed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define TYPE int
+#define N 800
+
+#pragma GCC target "+nosve"
+
+TYPE a[N];
+
+/*
+** foo:
+** ...
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** fmov x[0-9]+, d[0-9]+
+** ...
+*/ + +int foo () +{ +#pragma GCC unroll 8 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c new file mode 100644 index 0000000000000000000000000000000000000000..31d2515dcb907dc32a1eae7a31d89ecd64a06e60 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE long long +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 4 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c new file mode 100644 index 0000000000000000000000000000000000000000..375fe1788af76138d0d3798eec1a128e7c8f9a04 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE short +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 16 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_TRUNC_ADD_HIGH" "vect" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c new file mode 100644 index 0000000000000000000000000000000000000000..e584bfac6271a07680b09a5aad586f6dbdd53f1d --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ + +#define TYPE char +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +int foo () +{ +#pragma GCC unroll 32 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump-not "VEC_TRUNC_ADD_HIGH" "vect" } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 9fcc2fd084987e564f496a02af7d2b8547c11cd5..a945cce0e67a28694ac7016714f9450bfa7b9aa9 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12336,7 +12336,7 @@ vectorizable_early_exit (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info)); gcond *cond_stmt = as_a <gcond *>(orig_stmt); - tree cst = build_zero_cst (vectype); + tree vectype_out = vectype; auto bb = gimple_bb (cond_stmt); edge exit_true_edge = EDGE_SUCC (bb, 0); if (exit_true_edge->flags & EDGE_FALSE_VALUE) @@ -12353,10 +12353,37 @@ vectorizable_early_exit (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo), exit_true_edge->dest); + /* See if we support ADDHN and use that for the reduction. */ + internal_fn ifn = IFN_VEC_TRUNC_ADD_HIGH; + bool addhn_supported_p + = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_BOTH); + tree narrow_type = NULL_TREE; + if (addhn_supported_p) + { + /* Calculate the narrowing type for the result. */ + auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2; + auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype)); + tree itype = build_nonstandard_integer_type (halfprec, unsignedp); + tree tmp_type = build_vector_type (itype, TYPE_VECTOR_SUBPARTS (vectype)); + narrow_type = truth_type_for (tmp_type); + + if (direct_optab_handler (cbranch_optab, TYPE_MODE (narrow_type)) + == CODE_FOR_nothing) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "can't use ADDHN reduction because cbranch for " + "the narrowed type is not supported by the " + "target.\n"); + addhn_supported_p = false; + } + } + /* Analyze only. 
*/ if (cost_vec) { - if (direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing) + if (!addhn_supported_p + && direct_optab_handler (cbranch_optab, mode) == CODE_FOR_nothing) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -12462,10 +12489,22 @@ vectorizable_early_exit (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, while (workset.length () > 1) { - new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc"); tree arg0 = workset.pop (); tree arg1 = workset.pop (); - new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + if (addhn_supported_p && workset.length () == 0) + { + new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1); + vectype_out = narrow_type; + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp); + gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true); + } + else + { + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + new_stmt + = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + } vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, &cond_gsi); workset.quick_insert (0, new_temp); @@ -12488,6 +12527,7 @@ vectorizable_early_exit (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, gcc_assert (new_temp); + tree cst = build_zero_cst (vectype_out); gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst); update_stmt (orig_stmt);
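For reference, the shape of the change to the final reduction step can be
sketched with ACLE intrinsics.  This is a hand-written illustration of the
idea for the long long case, not the vectorizer's literal output; the
function names and the exact compression sequence in the "before" case are
illustrative assumptions:

#include <arm_neon.h>
#include <stdint.h>

/* Before: OR the two 0/-1 comparison masks, then compress the 128-bit
   result to 64 bits (one possible compression shown) for the branch.  */
uint64_t
any_lane_or (uint64x2_t m0, uint64x2_t m1)
{
  uint64x2_t m = vorrq_u64 (m0, m1);
  uint32x2_t n = vshrn_n_u64 (m, 32);  /* narrow: keep the high halves */
  return vget_lane_u64 (vreinterpret_u64_u32 (n), 0);
}

/* After: ADDHN combines and narrows in one instruction.  Adding two 0/-1
   masks leaves the high half of a lane nonzero iff either input lane was
   set, so the narrowed result is nonzero iff any lane matched.  */
uint64_t
any_lane_addhn (uint64x2_t m0, uint64x2_t m1)
{
  uint32x2_t n = vaddhn_u64 (m0, m1);
  return vget_lane_u64 (vreinterpret_u64_u32 (n), 0);
}

In both cases the scalar fmov and the compare against zero then operate on
the narrowed 64-bit value unchanged.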