https://gcc.gnu.org/g:56946c801a7cf3a831a11870b7e11ba08bf9bd87

commit r15-6120-g56946c801a7cf3a831a11870b7e11ba08bf9bd87
Author: Filip Kastl <fka...@suse.cz>
Date:   Wed Dec 11 19:57:04 2024 +0100

    gimple: Add limit after which slower switchlower algs are used [PR117091] 
[PR117352]
    
    This patch adds a limit on the number of cases of a switch.  When this
    limit is exceeded, switch lowering decides to use faster but less
    powerful algorithms.
    
    In particular this means that for finding bit tests switch lowering
    decides between the old dynamic programming O(n^2) algorithm and the
    new greedy algorithm that Andi Kleen recently added but then reverted
    due to PR117352.  It also means that switch lowering may bail out on
    finding jump tables if the switch is too large  (Btw it also may not
    bail!  It can happen that the greedy algorithms finds some bit tests
    which then basically split the switch into multiple smaller switches and
    those may be small enough to fit under the limit.)
    
    The limit is implemented as --param switch-lower-slow-alg-max-cases.
    Exceeding the limit is reported through -Wdisabled-optimization.
    
    This patch fixes the issue with the greedy algorithm described in
    PR117352.  The problem was incorrect usage of the is_beneficial()
    heuristic.
    
    gcc/ChangeLog:
    
            PR middle-end/117091
            PR middle-end/117352
            * doc/invoke.texi: Add switch-lower-slow-alg-max-cases.
            * params.opt: Add switch-lower-slow-alg-max-cases.
            * tree-switch-conversion.cc (jump_table_cluster::find_jump_tables):
            Note in a comment that we are looking for jump tables in
            case sequences delimited by the already found bit tests.
            (bit_test_cluster::find_bit_tests): Decide between
            find_bit_tests_fast() and find_bit_tests_slow().
            (bit_test_cluster::find_bit_tests_fast): New function.
            (bit_test_cluster::find_bit_tests_slow): New function.
            (switch_decision_tree::analyze_switch_statement): Report
            exceeding the limit.
            * tree-switch-conversion.h: Add find_bit_tests_fast() and
            find_bit_tests_slow().
    
    Co-Authored-By: Andi Kleen <a...@gcc.gnu.org>
    Signed-off-by: Filip Kastl <fka...@suse.cz>

Diff:
---
 gcc/doc/invoke.texi           |   3 ++
 gcc/params.opt                |   4 ++
 gcc/tree-switch-conversion.cc | 112 +++++++++++++++++++++++++++++++++++++++---
 gcc/tree-switch-conversion.h  |  18 +++++++
 4 files changed, 130 insertions(+), 7 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 14afd1934bd2..3cb9a50b6909 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16500,6 +16500,9 @@ Switch initialization conversion refuses to create 
arrays that are
 bigger than @option{switch-conversion-max-branch-ratio} times the number of
 branches in the switch.
 
+@item switch-lower-slow-alg-max-cases
+Maximum number of cases for slow switch lowering algorithms to be used.
+
 @item max-partial-antic-length
 Maximum length of the partial antic set computed during the tree
 partial redundancy elimination optimization (@option{-ftree-pre}) when
diff --git a/gcc/params.opt b/gcc/params.opt
index 5853bf02f9ee..1c88d5212c40 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1052,6 +1052,10 @@ Maximum number of instruction distance that a small 
store forwarded to a larger
 Common Joined UInteger Var(param_switch_conversion_branch_ratio) Init(8) 
IntegerRange(1, 65536) Param Optimization
 The maximum ratio between array size and switch branches for a switch 
conversion to take place.
 
+-param=switch-lower-slow-alg-max-cases=
+Common Joined UInteger Var(param_switch_lower_slow_alg_max_cases) Init(1000) 
IntegerRange(1, 1000000000) Param Optimization
+Maximum number of cases for slow switch lowering algorithms to be used.
+
 -param=modref-max-bases=
 Common Joined UInteger Var(param_modref_max_bases) Init(32) Param Optimization
 Maximum number of bases stored in each modref tree.
diff --git a/gcc/tree-switch-conversion.cc b/gcc/tree-switch-conversion.cc
index 3436c2a8b98c..b98e70cf7d16 100644
--- a/gcc/tree-switch-conversion.cc
+++ b/gcc/tree-switch-conversion.cc
@@ -54,6 +54,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, 
MA
 #include "tree-cfgcleanup.h"
 #include "hwint.h"
 #include "internal-fn.h"
+#include "diagnostic-core.h"
 
 /* ??? For lang_hooks.types.type_for_mode, but is there a word_mode
    type in the GIMPLE type system that is language-independent?  */
@@ -1641,6 +1642,11 @@ jump_table_cluster::find_jump_tables (vec<cluster *> 
&clusters)
     return clusters.copy ();
 
   unsigned l = clusters.length ();
+
+  /* Note: l + 1 is the number of cases of the switch.  */
+  if (l + 1 > (unsigned) param_switch_lower_slow_alg_max_cases)
+    return clusters.copy ();
+
   auto_vec<min_cluster_item> min;
   min.reserve (l + 1);
 
@@ -1771,16 +1777,80 @@ jump_table_cluster::is_beneficial (const vec<cluster *> 
&,
   return end - start + 1 >= case_values_threshold ();
 }
 
-/* Find bit tests of given CLUSTERS, where all members of the vector
-   are of type simple_cluster.   MAX_C is the approx max number of cases per
-   label.  New clusters are returned.  */
+/* Find bit tests of given CLUSTERS, where all members of the vector are of
+   type simple_cluster.  Use a fast algorithm that might not find the optimal
+   solution (minimal number of clusters on the output).  New clusters are
+   returned.
+
+   You should call find_bit_tests () instead of calling this function
+   directly.  */
 
 vec<cluster *>
-bit_test_cluster::find_bit_tests (vec<cluster *> &clusters, int max_c)
+bit_test_cluster::find_bit_tests_fast (vec<cluster *> &clusters)
 {
-  if (!is_enabled () || max_c == 1)
-    return clusters.copy ();
+  unsigned l = clusters.length ();
+  vec<cluster *> output;
+
+  output.create (l);
+
+  /* Look at sliding BITS_PER_WORD sized windows in the switch value space
+     and determine if they are suitable for a bit test cluster.  Worst case
+     this can examine every value BITS_PER_WORD-1 times.  */
+  unsigned k;
+  for (unsigned i = 0; i < l; i += k)
+    {
+      hash_set<basic_block> targets;
+      cluster *start_cluster = clusters[i];
+
+      /* Find the biggest k such that clusters i to i+k-1 can be turned into a
+        one big bit test cluster.  */
+      k = 0;
+      while (i + k < l)
+       {
+         cluster *end_cluster = clusters[i + k];
+
+         /* Does value range fit into the BITS_PER_WORD window?  */
+         HOST_WIDE_INT w = cluster::get_range (start_cluster->get_low (),
+                                               end_cluster->get_high ());
+         if (w == 0 || w > BITS_PER_WORD)
+           break;
+
+         /* Check for max # of targets.  */
+         if (targets.elements () == m_max_case_bit_tests
+             && !targets.contains (end_cluster->m_case_bb))
+           break;
+
+         targets.add (end_cluster->m_case_bb);
+         k++;
+       }
 
+      if (is_beneficial (k, targets.elements ()))
+       {
+         output.safe_push (new bit_test_cluster (clusters, i, i + k - 1,
+                                                 i == 0 && k == l));
+       }
+      else
+       {
+         output.safe_push (clusters[i]);
+         /* ??? Might be able to skip more.  */
+         k = 1;
+       }
+    }
+
+  return output;
+}
+
+/* Find bit tests of given CLUSTERS, where all members of the vector
+   are of type simple_cluster.  Use a slow (quadratic) algorithm that always
+   finds the optimal solution (minimal number of clusters on the output).  New
+   clusters are returned.
+
+   You should call find_bit_tests () instead of calling this function
+   directly.  */
+
+vec<cluster *>
+bit_test_cluster::find_bit_tests_slow (vec<cluster *> &clusters)
+{
   unsigned l = clusters.length ();
   auto_vec<min_cluster_item> min;
   min.reserve (l + 1);
@@ -1834,6 +1904,25 @@ bit_test_cluster::find_bit_tests (vec<cluster *> 
&clusters, int max_c)
   return output;
 }
 
+/* Find bit tests of given CLUSTERS, where all members of the vector
+   are of type simple_cluster.  MAX_C is the approx max number of cases per
+   label.  New clusters are returned.  */
+
+vec<cluster *>
+bit_test_cluster::find_bit_tests (vec<cluster *> &clusters, int max_c)
+{
+  if (!is_enabled () || max_c == 1)
+    return clusters.copy ();
+
+  unsigned l = clusters.length ();
+
+  /* Note: l + 1 is the number of cases of the switch.  */
+  if (l + 1 > (unsigned) param_switch_lower_slow_alg_max_cases)
+    return find_bit_tests_fast (clusters);
+  else
+    return find_bit_tests_slow (clusters);
+}
+
 /* Return true when RANGE of case values with UNIQ labels
    can build a bit test.  */
 
@@ -2264,10 +2353,19 @@ switch_decision_tree::analyze_switch_statement ()
 
   reset_out_edges_aux (m_switch);
 
+  if (l > (unsigned) param_switch_lower_slow_alg_max_cases)
+    warning_at (gimple_location (m_switch), OPT_Wdisabled_optimization,
+              "Using faster switch lowering algorithms. "
+              "Number of switch cases (%d) exceeds "
+              "%<--param=switch-lower-slow-alg-max-cases=%d%> limit.",
+              l, param_switch_lower_slow_alg_max_cases);
+
   /* Find bit-test clusters.  */
   vec<cluster *> output = bit_test_cluster::find_bit_tests (clusters, max_c);
 
-  /* Find jump table clusters.  */
+  /* Find jump table clusters.  We are looking for these in the sequences of
+     simple clusters which we didn't manage to convert into bit-test
+     clusters.  */
   vec<cluster *> output2;
   auto_vec<cluster *> tmp;
   output2.create (1);
diff --git a/gcc/tree-switch-conversion.h b/gcc/tree-switch-conversion.h
index e6a85fa60258..560620903a86 100644
--- a/gcc/tree-switch-conversion.h
+++ b/gcc/tree-switch-conversion.h
@@ -397,6 +397,24 @@ public:
             tree default_label_expr, basic_block default_bb, location_t loc)
      final override;
 
+  /* Find bit tests of given CLUSTERS, where all members of the vector are of
+     type simple_cluster.  Use a fast algorithm that might not find the optimal
+     solution (minimal number of clusters on the output).  New clusters are
+     returned.
+
+     You should call find_bit_tests () instead of calling this function
+     directly.  */
+  static vec<cluster *> find_bit_tests_fast (vec<cluster *> &clusters);
+
+  /* Find bit tests of given CLUSTERS, where all members of the vector
+     are of type simple_cluster.  Use a slow (quadratic) algorithm that always
+     finds the optimal solution (minimal number of clusters on the output).  
New
+     clusters are returned.
+
+     You should call find_bit_tests () instead of calling this function
+     directly.  */
+  static vec<cluster *> find_bit_tests_slow (vec<cluster *> &clusters);
+
   /* Find bit tests of given CLUSTERS, where all members of the vector
      are of type simple_cluster.  New clusters are returned.  */
   static vec<cluster *> find_bit_tests (vec<cluster *> &clusters, int max_c);

Reply via email to