The following changes the gate for performing vectorization of BB reductions to use needs_fold_left_reduction_p, which in turn requires handling TYPE_OVERFLOW_UNDEFINED types in the epilogue code generation by promoting any arithmetic generated there to unsigned arithmetic.
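In scalar terms the epilogue then computes the equivalent of the following (my illustration, not code from the patch; the function name is made up):

  /* Reassociating a signed char reduction can introduce intermediate
     overflows that the original evaluation order avoided, so compute
     in the corresponding unsigned type, where overflow wraps, and
     convert the result back at the end.  */
  signed char
  reduc_equiv (signed char v[16])
  {
    unsigned char sum = 0;
    for (int i = 0; i < 16; ++i)
      sum += (unsigned char) v[i];  /* wraps, never undefined */
    return (signed char) sum;       /* implementation-defined; GCC
				       wraps modulo 256 */
  }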
There's currently only v16qi where x86 supports a .REDUC_PLUS reduction for integral modes, so I had to add an x86-specific testcase using GIMPLE IL.

Bootstrap and regtest ongoing on x86_64-unknown-linux-gnu.

The next plan is to remove the restriction to .REDUC_PLUS, factoring out some of the general non-IFN way of doing a reduction epilogue from the loop reduction handling.  I already had a stab at in-order reductions, but those are really too similar to general SLP discovery from N scalar defs (and then replacing those with extracts).  At least, since there's no fold_left_plus that doesn't add to an existing scalar, I can't easily handle that case directly.  Possibly discovering { x_0, x_1, ..., x_n-1 }, extracting x_0, shifting the vector to { x_1, ..., x_n-1, <undef> } and using mask_fold_left_plus, accumulating to x_0 with the <undef> element masked, would do; a rough sketch follows below.  But I'm not sure that's worth the trouble.  In principle, with generic N scalar defs we could do a forward discovery from grouped loads and see where that goes (and of course handle in-order reductions that way).
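In pseudo-GIMPLE the idea would be roughly this (hand-written sketch, not valid -fgimple input; in particular the .MASK_FOLD_LEFT_PLUS operand order of accumulator, vector, mask is from memory):

  /* Vector of the N scalar defs as discovered.  */
  vec_1 = { x_0, x_1, ..., x_n-1 };
  /* Extract x_0 as the initial in-order accumulator.  */
  acc_2 = BIT_FIELD_REF <vec_1, 8, 0>;
  /* Shift out lane 0, leaving the last lane undefined.  */
  vec_3 = VEC_PERM_EXPR <vec_1, vec_1, { 1, 2, ..., n-1, n }>;
  /* Reduce in order, with the trailing <undef> lane masked out.  */
  res_4 = .MASK_FOLD_LEFT_PLUS (acc_2, vec_3, { -1, ..., -1, 0 });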
	* tree-vect-slp.cc (vect_slp_check_for_roots): Use
	!needs_fold_left_reduction_p to decide whether we can
	handle the reduction with association.
	(vectorize_slp_instance_root_stmt): For TYPE_OVERFLOW_UNDEFINED
	reductions perform all arithmetic in an unsigned type.

	* gcc.target/i386/vect-reduc-2.c: New testcase.
---
 gcc/testsuite/gcc.target/i386/vect-reduc-2.c | 77 ++++++++++++++++++++
 gcc/tree-vect-slp.cc                         | 44 +++++++----
 2 files changed, 107 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-reduc-2.c

diff --git a/gcc/testsuite/gcc.target/i386/vect-reduc-2.c b/gcc/testsuite/gcc.target/i386/vect-reduc-2.c
new file mode 100644
index 00000000000..62559ef8e7b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-reduc-2.c
@@ -0,0 +1,77 @@
+/* { dg-do compile } */
+/* { dg-options "-fgimple -O2 -msse2 -fdump-tree-slp2-optimized" } */
+
+signed char x[16];
+
+signed char __GIMPLE (ssa,guessed_local(1073741824))
+foo ()
+{
+  signed char _1;
+  signed char _3;
+  signed char _5;
+  signed char _6;
+  signed char _8;
+  signed char _9;
+  signed char _11;
+  signed char _12;
+  signed char _14;
+  signed char _15;
+  signed char _17;
+  signed char _18;
+  signed char _20;
+  signed char _21;
+  signed char _23;
+  signed char _24;
+  signed char _26;
+  signed char _27;
+  signed char _29;
+  signed char _30;
+  signed char _32;
+  signed char _33;
+  signed char _35;
+  signed char _36;
+  signed char _38;
+  signed char _39;
+  signed char _41;
+  signed char _42;
+  signed char _44;
+  signed char _45;
+  signed char _47;
+
+  __BB(2,guessed_local(1073741824)):
+  _1 = x[15];
+  _3 = x[1];
+  _5 = _1 + _3;
+  _6 = x[2];
+  _8 = _5 + _6;
+  _9 = x[3];
+  _11 = _8 + _9;
+  _12 = x[4];
+  _14 = _11 + _12;
+  _15 = x[5];
+  _17 = _14 + _15;
+  _18 = x[6];
+  _20 = _17 + _18;
+  _21 = x[7];
+  _23 = _20 + _21;
+  _24 = x[8];
+  _26 = _23 + _24;
+  _27 = x[9];
+  _29 = _26 + _27;
+  _30 = x[10];
+  _32 = _29 + _30;
+  _33 = x[11];
+  _35 = _32 + _33;
+  _36 = x[12];
+  _38 = _35 + _36;
+  _39 = x[13];
+  _41 = _38 + _39;
+  _42 = x[14];
+  _44 = _41 + _42;
+  _45 = x[0];
+  _47 = _44 + _45;
+  return _47;
+
+}
+
+/* { dg-final { scan-tree-dump "optimized: basic block part vectorized" "slp2" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7020bd9fa0e..07d68f2052b 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -7217,13 +7217,10 @@ vect_slp_check_for_roots (bb_vec_info bb_vinfo)
 	}
       else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
	       && (associative_tree_code (code) || code == MINUS_EXPR)
-	       /* ??? The flag_associative_math and TYPE_OVERFLOW_WRAPS
-		  checks pessimize a two-element reduction.  PR54400.
+	       /* ??? This pessimizes a two-element reduction.  PR54400.
		  ??? In-order reduction could be handled if we only
		  traverse one operand chain in vect_slp_linearize_chain.  */
-	       && ((FLOAT_TYPE_P (TREE_TYPE (rhs)) && flag_associative_math)
-		   || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
-		       && TYPE_OVERFLOW_WRAPS (TREE_TYPE (rhs))))
+	       && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
	       /* Ops with constants at the tail can be stripped here.  */
	       && TREE_CODE (rhs) == SSA_NAME
	       && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
@@ -9161,9 +9158,23 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
       /* We may end up with more than one vector result, reduce them
	 to one vector.  */
       tree vec_def = vec_defs[0];
+      tree vectype = TREE_TYPE (vec_def);
+      tree compute_vectype = vectype;
+      if (TYPE_OVERFLOW_UNDEFINED (vectype))
+	{
+	  compute_vectype = unsigned_type_for (vectype);
+	  vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
+				  compute_vectype, vec_def);
+	}
       for (unsigned i = 1; i < vec_defs.length (); ++i)
-	vec_def = gimple_build (&epilogue, reduc_code, TREE_TYPE (vec_def),
-				vec_def, vec_defs[i]);
+	{
+	  tree def = vec_defs[i];
+	  if (TYPE_OVERFLOW_UNDEFINED (vectype))
+	    def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
+				compute_vectype, def);
+	  vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
+				  vec_def, def);
+	}
       vec_defs.release ();
       /* ??? Support other schemes than direct internal fn.  */
       internal_fn reduc_fn;
@@ -9171,21 +9182,26 @@
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
       tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
-				      TREE_TYPE (TREE_TYPE (vec_def)), vec_def);
+				      TREE_TYPE (compute_vectype), vec_def);
       if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
	{
	  tree rem_def = NULL_TREE;
	  for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
-	    if (!rem_def)
-	      rem_def = def;
-	    else
-	      rem_def = gimple_build (&epilogue, reduc_code,
-				      TREE_TYPE (scalar_def),
-				      rem_def, def);
+	    {
+	      def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
+	      if (!rem_def)
+		rem_def = def;
+	      else
+		rem_def = gimple_build (&epilogue, reduc_code,
+					TREE_TYPE (scalar_def),
+					rem_def, def);
+	    }
	  scalar_def = gimple_build (&epilogue, reduc_code,
				     TREE_TYPE (scalar_def),
				     scalar_def, rem_def);
	}
+      scalar_def = gimple_convert (&epilogue,
+				   TREE_TYPE (vectype), scalar_def);
       gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
       gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
       gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
-- 
2.35.3