https://gcc.gnu.org/g:af9a3fe6a52974252516b3eea4c5ab5caae47b4b

commit r15-5894-gaf9a3fe6a52974252516b3eea4c5ab5caae47b4b
Author: Richard Biener <rguent...@suse.de>
Date:   Tue Dec 3 08:56:35 2024 +0100

    tree-optimization/117874 - optimize SLP discovery budget use
    
    The following tries to avoid eating into the SLP discovery limit
    when we can do cheaper checks first.  Together with the previous
    patch this allows using two-lane SLP discovery for mult_su3_an
    in 433.milc.
    
            PR tree-optimization/117874
            * tree-vect-slp.cc (vect_build_slp_tree_2): Perform early
            reassoc checks before eating into discovery limit.

Diff:
---
 gcc/tree-vect-slp.cc | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1799d5a619b1..425135a9ee0a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2292,6 +2292,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                }
            }
          /* 2. try to build children nodes, associating as necessary.  */
+         /* 2a. prepare and perform early checks to avoid eating into
+            discovery limit unnecessarily.  */
+         vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
          for (unsigned n = 0; n < chain_len; ++n)
            {
              vect_def_type dt = chains[0][n].dt;
@@ -2319,6 +2322,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                    matches[0] = false;
                  goto out;
                }
+             dts[n] = dt;
              if (dt == vect_constant_def
                  || dt == vect_external_def)
                {
@@ -2333,16 +2337,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                      matches[0] = false;
                      goto out;
                    }
-                 vec<tree> ops;
-                 ops.create (group_size);
-                 for (lane = 0; lane < group_size; ++lane)
-                   if (stmts[lane])
-                     ops.quick_push (chains[lane][n].op);
-                   else
-                     ops.quick_push (NULL_TREE);
-                 slp_tree child = vect_create_new_slp_node (ops);
-                 SLP_TREE_DEF_TYPE (child) = dt;
-                 children.safe_push (child);
                }
              else if (dt != vect_internal_def)
                {
@@ -2354,6 +2348,26 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                  hard_fail = false;
                  goto out;
                }
+           }
+         /* 2b. do the actual build.  */
+         for (unsigned n = 0; n < chain_len; ++n)
+           {
+             vect_def_type dt = dts[n];
+             unsigned lane;
+             if (dt == vect_constant_def
+                 || dt == vect_external_def)
+               {
+                 vec<tree> ops;
+                 ops.create (group_size);
+                 for (lane = 0; lane < group_size; ++lane)
+                   if (stmts[lane])
+                     ops.quick_push (chains[lane][n].op);
+                   else
+                     ops.quick_push (NULL_TREE);
+                 slp_tree child = vect_create_new_slp_node (ops);
+                 SLP_TREE_DEF_TYPE (child) = dt;
+                 children.safe_push (child);
+               }
              else
                {
                  vec<stmt_vec_info> op_stmts;
@@ -2396,6 +2410,11 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                                term = true;
                                break;
                              }
+                           if (dump_enabled_p ())
+                             dump_printf_loc (MSG_NOTE, vect_location,
+                                              "swapping operand %d and %d "
+                                              "of lane %d\n",
+                                              n, n + perms[lane] + 1, lane);
                            std::swap (chains[lane][n],
                                       chains[lane][n + perms[lane] + 1]);
                            perms[lane]++;

Reply via email to