On Fri, Jan 12, 2024 at 7:15 AM Feng Xue OS <[email protected]> wrote:
>
> Add a depth parameter to limit recursion of vec_slp_has_scalar_use.
OK.
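
For reference, the new test can be exercised on its own from the gcc/
directory of an aarch64 build with something like the following (the
exact invocation depends on the local setup):

  make check-gcc RUNTESTFLAGS="aarch64.exp=bb-slp-pr113091.c"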
> Feng
> -------
>
> .../gcc.target/aarch64/bb-slp-pr113091.c | 22 ++
> gcc/tree-vect-slp.cc | 207 ++++++++++++++----
> 2 files changed, 190 insertions(+), 39 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> new file mode 100644
> index 00000000000..ff822e90b4a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vectorize" } */
> +
> +int test(unsigned array[8]);
> +
> +int foo(char *a, char *b)
> +{
> + unsigned array[8];
> +
> + array[0] = (a[0] - b[0]);
> + array[1] = (a[1] - b[1]);
> + array[2] = (a[2] - b[2]);
> + array[3] = (a[3] - b[3]);
> + array[4] = (a[4] - b[4]);
> + array[5] = (a[5] - b[5]);
> + array[6] = (a[6] - b[6]);
> + array[7] = (a[7] - b[7]);
> +
> + return test(array);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index b6cce55ce90..086377a9ac0 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -6418,6 +6418,102 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> return res;
> }
>
> +/* Given a definition DEF, analyze if it will have any live scalar use after
> + performing SLP vectorization whose information is represented by BB_VINFO,
> + and record the result into hash map SCALAR_USE_MAP as a cache for later
> + fast checking. If recursion DEPTH exceeds a limit, stop the analysis and
> + make a conservative assumption. Return 0 if there is no scalar use, 1 if
> + there is, or -1 if recursion was limited. */
> +
> +static int
> +vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
> + hash_map<tree, int> &scalar_use_map,
> + int depth = 0)
> +{
> + const int depth_limit = 2;
> + imm_use_iterator use_iter;
> + gimple *use_stmt;
> +
> + if (int *res = scalar_use_map.get (def))
> + return *res;
> +
> + int scalar_use = 1;
> +
> + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
> + {
> + if (is_gimple_debug (use_stmt))
> + continue;
> +
> + stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> +
> + if (!use_stmt_info)
> + break;
> +
> + if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> + continue;
> +
> + /* Do not step forward when encountering a PHI statement, since it may
> + involve a cyclic reference and cause infinite recursion. */
> + if (gimple_code (use_stmt) == GIMPLE_PHI)
> + break;
> +
> + /* When pattern recognition is involved, a statement whose definition is
> + consumed in some pattern may not be included in the final replacement
> + pattern statements, and so would be skipped when building the SLP graph.
> +
> + * Original
> + char a_c = *(char *) a;
> + char b_c = *(char *) b;
> + unsigned short a_s = (unsigned short) a_c;
> + int a_i = (int) a_s;
> + int b_i = (int) b_c;
> + int r_i = a_i - b_i;
> +
> + * After pattern replacement
> + a_s = (unsigned short) a_c;
> + a_i = (int) a_s;
> +
> + patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
> + patt_b_i = (int) patt_b_s; // b_i = (int) b_c
> +
> + patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
> + patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
> +
> + The definitions of a_i (original statement) and b_i (pattern statement)
> + are related to, but not actually part of, the widen_minus pattern.
> + Vectorizing the pattern does not cause these definition statements to
> + be marked as PURE_SLP. For this case, we need to recursively check
> + whether their uses are all absorbed into vectorized code. But there
> + is an exception: some use may participate in a vectorized operation
> + via an external SLP node containing that use as an element. The
> + parameter "scalar_use_map" tags such SSA names as having a scalar
> + use in advance. */
> + tree lhs = gimple_get_lhs (use_stmt);
> +
> + if (!lhs || TREE_CODE (lhs) != SSA_NAME)
> + break;
> +
> + if (depth_limit && depth >= depth_limit)
> + return -1;
> +
> + if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
> + depth + 1)))
> + break;
> + }
> +
> + if (end_imm_use_stmt_p (&use_iter))
> + scalar_use = 0;
> +
> + /* If recursion is limited, do not cache result for non-root defs. */
> + if (!depth || scalar_use >= 0)
> + {
> + bool added = scalar_use_map.put (def, scalar_use);
> + gcc_assert (!added);
> + }
> +
> + return scalar_use;
> +}
> +
> /* Mark lanes of NODE that are live outside of the basic-block vectorized
> region and that can be vectorized using vectorizable_live_operation
> with STMT_VINFO_LIVE_P. Not handled live operations will cause the
> @@ -6427,6 +6523,7 @@ static void
> vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> slp_instance instance,
> stmt_vector_for_cost *cost_vec,
> + hash_map<tree, int> &scalar_use_map,
> hash_set<stmt_vec_info> &svisited,
> hash_set<slp_tree> &visited)
> {
> @@ -6451,32 +6548,22 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> def_operand_p def_p;
> FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
> {
> - imm_use_iterator use_iter;
> - gimple *use_stmt;
> - stmt_vec_info use_stmt_info;
> - FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> - if (!is_gimple_debug (use_stmt))
> - {
> - use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> - if (!use_stmt_info
> - || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> - {
> - STMT_VINFO_LIVE_P (stmt_info) = true;
> - if (vectorizable_live_operation (bb_vinfo, stmt_info,
> - node, instance, i,
> - false, cost_vec))
> - /* ??? So we know we can vectorize the live stmt
> - from one SLP node. If we cannot do so from all
> - or none consistently we'd have to record which
> - SLP node (and lane) we want to use for the live
> - operation. So make sure we can code-generate
> - from all nodes. */
> - mark_visited = false;
> - else
> - STMT_VINFO_LIVE_P (stmt_info) = false;
> - break;
> - }
> - }
> + if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
> + scalar_use_map))
> + {
> + STMT_VINFO_LIVE_P (stmt_info) = true;
> + if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
> + instance, i, false, cost_vec))
> + /* ??? So we know we can vectorize the live stmt from one SLP
> + node. If we cannot do so from all or none consistently
> + we'd have to record which SLP node (and lane) we want to
> + use for the live operation. So make sure we can
> + code-generate from all nodes. */
> + mark_visited = false;
> + else
> + STMT_VINFO_LIVE_P (stmt_info) = false;
> + }
> +
> /* We have to verify whether we can insert the lane extract
> before all uses. The following is a conservative approximation.
> We cannot put this into vectorizable_live_operation because
> @@ -6495,6 +6582,10 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> from the latest stmt in a node. So we compensate for this
> during code-generation, simply not replacing uses for those
> hopefully rare cases. */
> + imm_use_iterator use_iter;
> + gimple *use_stmt;
> + stmt_vec_info use_stmt_info;
> +
> if (STMT_VINFO_LIVE_P (stmt_info))
> FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> if (!is_gimple_debug (use_stmt)
> @@ -6517,8 +6608,56 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> slp_tree child;
> FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
> if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
> - vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
> - cost_vec, svisited, visited);
> + vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
> + scalar_use_map, svisited, visited);
> +}
> +
> +/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
> + are live outside of the basic-block vectorized region and that can be
> + vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
> +
> +static void
> +vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
> +{
> + if (bb_vinfo->slp_instances.is_empty ())
> + return;
> +
> + hash_set<stmt_vec_info> svisited;
> + hash_set<slp_tree> visited;
> + hash_map<tree, int> scalar_use_map;
> + auto_vec<slp_tree> worklist;
> +
> + for (slp_instance instance : bb_vinfo->slp_instances)
> + if (!visited.add (SLP_INSTANCE_TREE (instance)))
> + worklist.safe_push (SLP_INSTANCE_TREE (instance));
> +
> + do
> + {
> + slp_tree node = worklist.pop ();
> +
> + if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
> + {
> + for (tree op : SLP_TREE_SCALAR_OPS (node))
> + if (TREE_CODE (op) == SSA_NAME)
> + scalar_use_map.put (op, 1);
> + }
> + else
> + {
> + for (slp_tree child : SLP_TREE_CHILDREN (node))
> + if (child && !visited.add (child))
> + worklist.safe_push (child);
> + }
> + } while (!worklist.is_empty ());
> +
> + visited.empty ();
> +
> + for (slp_instance instance : bb_vinfo->slp_instances)
> + {
> + vect_location = instance->location ();
> + vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> + instance, &instance->cost_vec,
> + scalar_use_map, svisited, visited);
> + }
> }
>
> /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
> @@ -6684,17 +6823,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
>
> /* Compute vectorizable live stmts. */
> if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
> - {
> - hash_set<stmt_vec_info> svisited;
> - hash_set<slp_tree> visited;
> - for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
> - {
> - vect_location = instance->location ();
> - vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> - instance, &instance->cost_vec, svisited,
> - visited);
> - }
> - }
> + vect_bb_slp_mark_live_stmts (bb_vinfo);
>
> return !vinfo->slp_instances.is_empty ();
> }
> --
> 2.17.1
>
> ________________________________________
> From: Richard Biener <[email protected]>
> Sent: Thursday, January 11, 2024 5:52 PM
> To: Feng Xue OS; Richard Sandiford
> Cc: [email protected]
> Subject: Re: [PATCH] Do not count unused scalar use when marking STMT_VINFO_LIVE_P [PR113091]
>
> On Thu, Jan 11, 2024 at 10:46 AM Richard Biener <[email protected]> wrote:
> >
> > On Fri, Dec 29, 2023 at 11:29 AM Feng Xue OS <[email protected]> wrote:
> > >
> > > This patch is meant to fix over-estimation of the SLP vector-to-scalar
> > > cost for STMT_VINFO_LIVE_P statements. When pattern recognition is
> > > involved, a statement whose definition is consumed in some pattern may
> > > not be included in the final replacement pattern statements, and would
> > > be skipped when building the SLP graph.
> > >
> > > * Original
> > > char a_c = *(char *) a;
> > > char b_c = *(char *) b;
> > > unsigned short a_s = (unsigned short) a_c;
> > > int a_i = (int) a_s;
> > > int b_i = (int) b_c;
> > > int r_i = a_i - b_i;
> > >
> > > * After pattern replacement
> > > a_s = (unsigned short) a_c;
> > > a_i = (int) a_s;
> > >
> > > patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
> > > patt_b_i = (int) patt_b_s; // b_i = (int) b_c
> > >
> > > patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
> > > patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
> > >
> > > The definitions of a_i (original statement) and b_i (pattern statement)
> > > are related to, but not actually part of, the widen_minus pattern.
> > > Vectorizing the pattern does not cause these definition statements to
> > > be marked as PURE_SLP. For this case, we need to recursively check
> > > whether their uses are all absorbed into vectorized code. But there
> > > is an exception: some use may participate in a vectorized operation
> > > via an external SLP node containing that use as an element.
> > >
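> > > Roughly, the handling is two-phase; the following is a condensed sketch
> > > of the patch below, not the exact code:
> > >
> > >   /* Phase 1: pre-seed the cache. SSA operands of external SLP nodes
> > >      keep a scalar use by construction.  */
> > >   for (tree op : SLP_TREE_SCALAR_OPS (node))
> > >     if (TREE_CODE (op) == SSA_NAME)
> > >       scalar_use_map.put (op, true);
> > >
> > >   /* Phase 2: for each SLP-defined SSA name, recursively check whether
> > >      every non-debug use is absorbed into vectorized code.  */
> > >   if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
> > >                               scalar_use_map))
> > >     STMT_VINFO_LIVE_P (stmt_info) = true;
> > >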
> > > Feng
> > >
> > > ---
> > > .../gcc.target/aarch64/bb-slp-pr113091.c | 22 ++
> > > gcc/tree-vect-slp.cc | 189 ++++++++++++++----
> > > 2 files changed, 172 insertions(+), 39 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> > >
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> > > new file mode 100644
> > > index 00000000000..ff822e90b4a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/bb-slp-pr113091.c
> > > @@ -0,0 +1,22 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -fdump-tree-slp-details -ftree-slp-vectorize" } */
> > > +
> > > +int test(unsigned array[8]);
> > > +
> > > +int foo(char *a, char *b)
> > > +{
> > > + unsigned array[8];
> > > +
> > > + array[0] = (a[0] - b[0]);
> > > + array[1] = (a[1] - b[1]);
> > > + array[2] = (a[2] - b[2]);
> > > + array[3] = (a[3] - b[3]);
> > > + array[4] = (a[4] - b[4]);
> > > + array[5] = (a[5] - b[5]);
> > > + array[6] = (a[6] - b[6]);
> > > + array[7] = (a[7] - b[7]);
> > > +
> > > + return test(array);
> > > +}
> > > +
> > > +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized using SLP" 1 "slp2" } } */
> > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> > > index a82fca45161..d36ff37114e 100644
> > > --- a/gcc/tree-vect-slp.cc
> > > +++ b/gcc/tree-vect-slp.cc
> > > @@ -6418,6 +6418,84 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
> > > return res;
> > > }
> > >
> > > +/* Given a definition DEF, analyze if it will have any live scalar use after
> > > + performing SLP vectorization whose information is represented by BB_VINFO,
> > > + and record result into hash map SCALAR_USE_MAP as cache for later fast
> > > + check. */
> > > +
> > > +static bool
> > > +vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
> > > + hash_map<tree, bool> &scalar_use_map)
> > > +{
> > > + imm_use_iterator use_iter;
> > > + gimple *use_stmt;
> > > +
> > > + if (bool *res = scalar_use_map.get (def))
> > > + return *res;
> > > +
> > > + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
> > > + {
> > > + if (is_gimple_debug (use_stmt))
> > > + continue;
> > > +
> > > + stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> > > +
> > > + if (!use_stmt_info)
> > > + break;
> > > +
> > > + if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> > > + continue;
> > > +
> > > + /* Do not step forward when encounter PHI statement, since it may
> > > + involve cyclic reference and cause infinite recursive invocation. */
> > > + if (gimple_code (use_stmt) == GIMPLE_PHI)
> > > + break;
> > > +
> > > + /* When pattern recognition is involved, a statement whose definition is
> > > + consumed in some pattern, may not be included in the final replacement
> > > + pattern statements, so would be skipped when building SLP graph.
> > > +
> > > + * Original
> > > + char a_c = *(char *) a;
> > > + char b_c = *(char *) b;
> > > + unsigned short a_s = (unsigned short) a_c;
> > > + int a_i = (int) a_s;
> > > + int b_i = (int) b_c;
> > > + int r_i = a_i - b_i;
> > > +
> > > + * After pattern replacement
> > > + a_s = (unsigned short) a_c;
> > > + a_i = (int) a_s;
> > > +
> > > + patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
> > > + patt_b_i = (int) patt_b_s; // b_i = (int) b_c
> > > +
> > > + patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
> > > + patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
> > > +
> > > + The definitions of a_i(original statement) and b_i(pattern statement)
> > > + are related to, but actually not part of widen_minus pattern.
> > > + Vectorizing the pattern does not cause these definition statements to
> > > + be marked as PURE_SLP. For this case, we need to recursively check
> > > + whether their uses are all absorbed into vectorized code. But there
> > > + is an exception that some use may participate in an vectorized
> > > + operation via an external SLP node containing that use as an element.
> > > + The parameter "scalar_use_map" tags such kind of SSA as having scalar
> > > + use in advance. */
> > > + tree lhs = gimple_get_lhs (use_stmt);
> > > +
> > > + if (!lhs || TREE_CODE (lhs) != SSA_NAME
> > > + || vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map))
> >
> > This looks mostly good; the only worry I have is that this recursion
> > will - for unvectorized uses - eventually go all the way down to the end
> > of the vectorized region (aka the whole function). Since it's really
> > patterns we are after, it should be possible to limit the recursion to
> > use_stmts which are covered by a pattern, aka STMT_VINFO_IN_PATTERN_P?
>
> Hmm, in_pattern_p doesn't seem to be set for original stmts replaced by
> a pattern. In fact, we don't seem to mark those in any way? It should be
> possible to do this programmatically by looking at the root stmt uses,
> maybe. Maybe we can instead simply limit the recursion depth to one and
> only increase it when we get testcases requiring more? We should
> eventually revisit how we mark patterns in next stage1.
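>
> I.e. something along these lines, sketched (with -1 as a "recursion
> limited, conservatively assume a scalar use" result that is then not
> cached for non-root defs; this is roughly what the v2 patch above does):
>
>   if (depth >= depth_limit)
>     return -1;
>   if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
>                                             depth + 1)))
>     break;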
>
> > PHIs should then be magically covered since we have no patterns
> > for them (though explicit handling is good for documentation purposes).
> >
> > Thanks,
> > Richard.
> >
> > > + break;
> > > + }
> > > +
> > > + bool found = !end_imm_use_stmt_p (&use_iter);
> > > + bool added = scalar_use_map.put (def, found);
> > > +
> > > + gcc_assert (!added);
> > > + return found;
> > > +}
> > > +
> > > /* Mark lanes of NODE that are live outside of the basic-block vectorized
> > > region and that can be vectorized using vectorizable_live_operation
> > > with STMT_VINFO_LIVE_P. Not handled live operations will cause the
> > > @@ -6427,6 +6505,7 @@ static void
> > > vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> > > slp_instance instance,
> > > stmt_vector_for_cost *cost_vec,
> > > + hash_map<tree, bool> &scalar_use_map,
> > > hash_set<stmt_vec_info> &svisited,
> > > hash_set<slp_tree> &visited)
> > > {
> > > @@ -6451,32 +6530,22 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> > > def_operand_p def_p;
> > > FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
> > > {
> > > - imm_use_iterator use_iter;
> > > - gimple *use_stmt;
> > > - stmt_vec_info use_stmt_info;
> > > - FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > > - if (!is_gimple_debug (use_stmt))
> > > - {
> > > - use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
> > > - if (!use_stmt_info
> > > - || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
> > > - {
> > > - STMT_VINFO_LIVE_P (stmt_info) = true;
> > > - if (vectorizable_live_operation (bb_vinfo, stmt_info,
> > > - node, instance, i,
> > > - false, cost_vec))
> > > - /* ??? So we know we can vectorize the live stmt
> > > - from one SLP node. If we cannot do so from all
> > > - or none consistently we'd have to record which
> > > - SLP node (and lane) we want to use for the live
> > > - operation. So make sure we can code-generate
> > > - from all nodes. */
> > > - mark_visited = false;
> > > - else
> > > - STMT_VINFO_LIVE_P (stmt_info) = false;
> > > - break;
> > > - }
> > > - }
> > > + if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
> > > + scalar_use_map))
> > > + {
> > > + STMT_VINFO_LIVE_P (stmt_info) = true;
> > > + if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
> > > + instance, i, false, cost_vec))
> > > + /* ??? So we know we can vectorize the live stmt from one SLP
> > > + node. If we cannot do so from all or none consistently
> > > + we'd have to record which SLP node (and lane) we want to
> > > + use for the live operation. So make sure we can
> > > + code-generate from all nodes. */
> > > + mark_visited = false;
> > > + else
> > > + STMT_VINFO_LIVE_P (stmt_info) = false;
> > > + }
> > > +
> > > /* We have to verify whether we can insert the lane extract
> > > before all uses. The following is a conservative approximation.
> > > We cannot put this into vectorizable_live_operation because
> > > @@ -6495,6 +6564,10 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> > > from the latest stmt in a node. So we compensate for this
> > > during code-generation, simply not replacing uses for those
> > > hopefully rare cases. */
> > > + imm_use_iterator use_iter;
> > > + gimple *use_stmt;
> > > + stmt_vec_info use_stmt_info;
> > > +
> > > if (STMT_VINFO_LIVE_P (stmt_info))
> > > FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
> > > if (!is_gimple_debug (use_stmt)
> > > @@ -6517,8 +6590,56 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
> > > slp_tree child;
> > > FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
> > > if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
> > > - vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance,
> > > - cost_vec, svisited, visited);
> > > + vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
> > > + scalar_use_map, svisited, visited);
> > > +}
> > > +
> > > +/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
> > > + are live outside of the basic-block vectorized region and that can be
> > > + vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
> > > +
> > > +static void
> > > +vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
> > > +{
> > > + if (bb_vinfo->slp_instances.is_empty ())
> > > + return;
> > > +
> > > + hash_set<stmt_vec_info> svisited;
> > > + hash_set<slp_tree> visited;
> > > + hash_map<tree, bool> scalar_use_map;
> > > + auto_vec<slp_tree> worklist;
> > > +
> > > + for (slp_instance instance : bb_vinfo->slp_instances)
> > > + if (!visited.add (SLP_INSTANCE_TREE (instance)))
> > > + worklist.safe_push (SLP_INSTANCE_TREE (instance));
> > > +
> > > + do
> > > + {
> > > + slp_tree node = worklist.pop ();
> > > +
> > > + if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
> > > + {
> > > + for (tree op : SLP_TREE_SCALAR_OPS (node))
> > > + if (TREE_CODE (op) == SSA_NAME)
> > > + scalar_use_map.put (op, true);
> > > + }
> > > + else
> > > + {
> > > + for (slp_tree child : SLP_TREE_CHILDREN (node))
> > > + if (child && !visited.add (child))
> > > + worklist.safe_push (child);
> > > + }
> > > + } while (!worklist.is_empty ());
> > > +
> > > + visited.empty ();
> > > +
> > > + for (slp_instance instance : bb_vinfo->slp_instances)
> > > + {
> > > + vect_location = instance->location ();
> > > + vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> > > + instance, &instance->cost_vec,
> > > + scalar_use_map, svisited, visited);
> > > + }
> > > }
> > >
> > > /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
> > > @@ -6684,17 +6805,7 @@ vect_slp_analyze_operations (vec_info *vinfo)
> > >
> > > /* Compute vectorizable live stmts. */
> > > if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
> > > - {
> > > - hash_set<stmt_vec_info> svisited;
> > > - hash_set<slp_tree> visited;
> > > - for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
> > > - {
> > > - vect_location = instance->location ();
> > > - vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
> > > - instance, &instance->cost_vec, svisited,
> > > - visited);
> > > - }
> > > - }
> > > + vect_bb_slp_mark_live_stmts (bb_vinfo);
> > >
> > > return !vinfo->slp_instances.is_empty ();
> > > }
> > > --
> > > 2.17.1