Given a sequence such as int foo () { #pragma GCC unroll 4 for (int i = 0; i < N; i++) if (a[i] == 124) return 1;
return 0; } where a[i] is long long, we will unroll the loop and use an OR reduction for early break on Adv. SIMD. Afterwards the sequence is followed by a compression sequence to compress the 128-bit vectors into 64 bits for use by the branch. However, if we have support for add halving and narrowing then we can, instead of using an OR, use an ADDHN which will do the combining and narrowing. Note that for now I only do the last OR, however if we have more than one level of unrolling we could technically chain them. I will revisit this in another upcoming early break series, however an unroll of 2 is fairly common. Bootstrapped and regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf, x86_64-pc-linux-gnu -m32, -m64 and no issues and about a 10% improvement in this sequence for Adv. SIMD. Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-vect-stmts.cc (vectorizable_early_exit): Use addhn if supported. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vect-early-break-addhn_1.c: New test. * gcc.target/aarch64/vect-early-break-addhn_2.c: New test. * gcc.target/aarch64/vect-early-break-addhn_3.c: New test. * gcc.target/aarch64/vect-early-break-addhn_4.c: New test. --- diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0fce36f277f389d5f43174e398b8800ab11b31da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE int +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 8 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9c781620749c1bd4ea6b0290d862f8ff5c84e6db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE long long +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 4 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c new file mode 100644 index 0000000000000000000000000000000000000000..0cebe9bdf4a1b8ba576f9c04fc7d2b8d79b97a9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE short +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... 
+** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 16 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c new file mode 100644 index 0000000000000000000000000000000000000000..9e35329cb271d38eff845c49df406f8501870b36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ + +#define TYPE char +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +int foo () +{ +#pragma GCC unroll 32 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump-not "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 97b3d4801d19f3168b91c91271e882bad3f99f13..a1ecce8ea227654907c59828ff34c177cf680061 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12292,7 +12292,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info)); gcond *cond_stmt = as_a <gcond *>(orig_stmt); - tree cst = build_zero_cst (vectype); + tree vectype_out = vectype; auto bb = gimple_bb (cond_stmt); edge exit_true_edge = EDGE_SUCC (bb, 0); if (exit_true_edge->flags & EDGE_FALSE_VALUE) @@ -12416,12 +12416,40 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, else workset.splice (stmts); + /* See if we support ADDHN and use that for the reduction. 
*/ + internal_fn ifn = IFN_VEC_ADD_HALFING_NARROW_LO; + bool addhn_supported_p + = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED); + tree narrow_type = NULL_TREE; + if (addhn_supported_p) + { + /* Calculate the narrowing type for the result. */ + auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2; + auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype)); + tree itype = build_nonstandard_integer_type (halfprec, unsignedp); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + tree tmp_type = build_vector_type (itype, nunits); + narrow_type = truth_type_for (tmp_type); + } + while (workset.length () > 1) { - new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc"); tree arg0 = workset.pop (); tree arg1 = workset.pop (); - new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + if (addhn_supported_p && workset.length () == 0) + { + new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1); + vectype_out = narrow_type; + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp); + gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true); + } + else + { + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + new_stmt + = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + } vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, &cond_gsi); workset.quick_insert (0, new_temp); @@ -12444,6 +12472,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gcc_assert (new_temp); + tree cst = build_zero_cst (vectype_out); gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst); update_stmt (orig_stmt); --
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c new file mode 100644 index 0000000000000000000000000000000000000000..0fce36f277f389d5f43174e398b8800ab11b31da --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_1.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE int +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** cmeq v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** fmov x[0-9]+, d[0-9]+ +** ... +*/ + +int foo () +{ +#pragma GCC unroll 8 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9c781620749c1bd4ea6b0290d862f8ff5c84e6db --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE long long +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** cmeq v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 4 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c new file mode 100644 index 0000000000000000000000000000000000000000..0cebe9bdf4a1b8ba576f9c04fc7d2b8d79b97a9e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +#define TYPE short +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +/* +** foo: +** ... +** ldp q[0-9]+, q[0-9]+, \[x[0-9]+\], 32 +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** cmeq v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** fmov x[0-9]+, d[0-9]+ +** ... 
+*/ + +int foo () +{ +#pragma GCC unroll 16 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c new file mode 100644 index 0000000000000000000000000000000000000000..9e35329cb271d38eff845c49df406f8501870b36 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/vect-early-break-addhn_4.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -fdump-tree-vect-details -std=c99" } */ + +#define TYPE char +#define N 800 + +#pragma GCC target "+nosve" + +TYPE a[N]; + +int foo () +{ +#pragma GCC unroll 32 + for (int i = 0; i < N; i++) + if (a[i] == 124) + return 1; + + return 0; +} + +/* { dg-final { scan-tree-dump-not "VEC_ADD_HALFING_NARROW_LO" "vect" } } */ + diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 97b3d4801d19f3168b91c91271e882bad3f99f13..a1ecce8ea227654907c59828ff34c177cf680061 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -12292,7 +12292,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info)); gcond *cond_stmt = as_a <gcond *>(orig_stmt); - tree cst = build_zero_cst (vectype); + tree vectype_out = vectype; auto bb = gimple_bb (cond_stmt); edge exit_true_edge = EDGE_SUCC (bb, 0); if (exit_true_edge->flags & EDGE_FALSE_VALUE) @@ -12416,12 +12416,40 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, else workset.splice (stmts); + /* See if we support ADDHN and use that for the reduction. */ + internal_fn ifn = IFN_VEC_ADD_HALFING_NARROW_LO; + bool addhn_supported_p + = direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED); + tree narrow_type = NULL_TREE; + if (addhn_supported_p) + { + /* Calculate the narrowing type for the result. 
*/ + auto halfprec = TYPE_PRECISION (TREE_TYPE (vectype)) / 2; + auto unsignedp = TYPE_UNSIGNED (TREE_TYPE (vectype)); + tree itype = build_nonstandard_integer_type (halfprec, unsignedp); + poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); + tree tmp_type = build_vector_type (itype, nunits); + narrow_type = truth_type_for (tmp_type); + } + while (workset.length () > 1) { - new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc"); tree arg0 = workset.pop (); tree arg1 = workset.pop (); - new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + if (addhn_supported_p && workset.length () == 0) + { + new_stmt = gimple_build_call_internal (ifn, 2, arg0, arg1); + vectype_out = narrow_type; + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + gimple_call_set_lhs (as_a <gcall *> (new_stmt), new_temp); + gimple_call_set_nothrow (as_a <gcall *> (new_stmt), true); + } + else + { + new_temp = make_temp_ssa_name (vectype_out, NULL, "vexit_reduc"); + new_stmt + = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1); + } vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, &cond_gsi); workset.quick_insert (0, new_temp); @@ -12444,6 +12472,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info, gcc_assert (new_temp); + tree cst = build_zero_cst (vectype_out); gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst); update_stmt (orig_stmt);