https://gcc.gnu.org/g:4e7735a8d87559bbddfe3a985786996e22241f8d

commit r14-10588-g4e7735a8d87559bbddfe3a985786996e22241f8d
Author: liuhongt <hongtao....@intel.com>
Date:   Mon Aug 12 14:35:31 2024 +0800

    Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.
    
    gcc/ChangeLog:
    
            PR target/116174
            * config/i386/i386.cc (ix86_align_loops): Move this to ..
            * config/i386/i386-features.cc (ix86_align_loops): .. here.
            (class pass_align_tight_loops): New class.
            (make_pass_align_tight_loops): New function.
            * config/i386/i386-passes.def: Insert pass_align_tight_loops
            after pass_insert_endbr_and_patchable_area.
            * config/i386/i386-protos.h (make_pass_align_tight_loops): New
            declare.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr116174.c: New test.
    
    (cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8)
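
Why the ordering matters: the new test below asserts that, with
-fcf-protection=branch, the endbr32/endbr64 instruction immediately
follows .cfi_startproc, i.e. that no loop-alignment padding ends up
between the function label and its endbr, which is what moving the
alignment after pass_insert_endbr_and_patchable_area ensures.  A
minimal example of the kind of loop the pass targets (modeled on the
new test; the exact assembly will vary with target and tuning):

    /* Compile with -O2 -fcf-protection=branch.  The copy loop is only
       a few bytes, so it can fit in one cache line; the pass may
       force-align its head, while endbr stays the first instruction
       of foo.  */
    char *
    foo (char *dest, const char *src)
    {
      while ((*dest++ = *src++) != '\0')
        ; /* tight loop: a handful of insns, one backward branch */
      return --dest;
    }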

Diff:
---
 gcc/config/i386/i386-features.cc         | 191 +++++++++++++++++++++++++++++++
 gcc/config/i386/i386-passes.def          |   3 +
 gcc/config/i386/i386-protos.h            |   1 +
 gcc/config/i386/i386.cc                  | 146 -----------------------
 gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
 5 files changed, 207 insertions(+), 146 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e3e004d5526..7de19d42363 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+    return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      rtx_insn *label = BB_HEAD (bb);
+      bool has_fallthru = 0;
+      edge e;
+      edge_iterator ei;
+
+      if (!LABEL_P (label))
+       continue;
+
+      profile_count fallthru_count = profile_count::zero ();
+      profile_count branch_count = profile_count::zero ();
+
+      FOR_EACH_EDGE (e, ei, bb->preds)
+       {
+         if (e->flags & EDGE_FALLTHRU)
+           has_fallthru = 1, fallthru_count += e->count ();
+         else
+           branch_count += e->count ();
+       }
+
+      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+       continue;
+
+      if (bb->loop_father
+         && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+         && (has_fallthru
+             ? (!(single_succ_p (bb)
+                  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+                && optimize_bb_for_speed_p (bb)
+                && branch_count + fallthru_count > count_threshold
+                && (branch_count > fallthru_count * param_align_loop_iterations))
+             /* In case there's no fallthru for the loop.
+                Nops inserted won't be executed.  */
+             : (branch_count > count_threshold
+                || (bb->count > bb->prev_bb->count * 10
+                    && (bb->prev_bb->count
+                        <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+       {
+         rtx_insn* insn, *end_insn;
+         HOST_WIDE_INT size = 0;
+         bool padding_p = true;
+         basic_block tbb = bb;
+         unsigned cond_branch_num = 0;
+         bool detect_tight_loop_p = false;
+
+         for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+              i++, tbb = tbb->next_bb)
+           {
+             /* Only handle continuous cfg layout. */
+             if (bb->loop_father != tbb->loop_father)
+               {
+                 padding_p = false;
+                 break;
+               }
+
+             FOR_BB_INSNS (tbb, insn)
+               {
+                 if (!NONDEBUG_INSN_P (insn))
+                   continue;
+                 size += ix86_min_insn_size (insn);
+
+                 /* We don't know size of inline asm.
+                    Don't align loop for call.  */
+                 if (asm_noperands (PATTERN (insn)) >= 0
+                     || CALL_P (insn))
+                   {
+                     size = -1;
+                     break;
+                   }
+               }
+
+             if (size == -1 || size > ix86_cost->prefetch_block)
+               {
+                 padding_p = false;
+                 break;
+               }
+
+             FOR_EACH_EDGE (e, ei, tbb->succs)
+               {
+                 /* It could be part of the loop.  */
+                 if (e->dest == bb)
+                   {
+                     detect_tight_loop_p = true;
+                     break;
+                   }
+               }
+
+             if (detect_tight_loop_p)
+               break;
+
+             end_insn = BB_END (tbb);
+             if (JUMP_P (end_insn))
+               {
+                 /* For decoded icache:
+                    1. Up to two branches are allowed per Way.
+                    2. A non-conditional branch is the last micro-op in a Way.
+                 */
+                 if (onlyjump_p (end_insn)
+                     && (any_uncondjump_p (end_insn)
+                         || single_succ_p (tbb)))
+                   {
+                     padding_p = false;
+                     break;
+                   }
+                 else if (++cond_branch_num >= 2)
+                   {
+                     padding_p = false;
+                     break;
+                   }
+               }
+
+           }
+
+         if (padding_p && detect_tight_loop_p)
+           {
+             emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+                                                   GEN_INT (0)), label);
+             /* End of function.  */
+             if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+               break;
+             /* Skip bb which already fits into one cacheline.  */
+             bb = tbb;
+           }
+       }
+    }
+
+  loop_optimizer_finalize ();
+  free_dominance_info (CDI_DOMINATORS);
+}
+
+namespace {
+
+const pass_data pass_data_align_tight_loops =
+{
+  RTL_PASS, /* type */
+  "align_tight_loops", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
+class pass_align_tight_loops : public rtl_opt_pass
+{
+public:
+  pass_align_tight_loops (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  bool gate (function *) final override
+    {
+      return optimize && optimize_function_for_speed_p (cfun);
+    }
+
+  unsigned int execute (function *) final override
+    {
+      timevar_push (TV_MACH_DEP);
+#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
+      ix86_align_loops ();
+#endif
+      timevar_pop (TV_MACH_DEP);
+      return 0;
+    }
+}; // class pass_align_tight_loops
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_align_tight_loops (gcc::context *ctxt)
+{
+  return new pass_align_tight_loops (ctxt);
+}
+
 /* This compares the priority of target features in function DECL1
    and DECL2.  It returns positive value if DECL1 is higher priority,
    negative value if DECL2 is higher priority and 0 if they are the
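
A note on the alignment request above: gen_max_skip_align
(GEN_INT (ceil_log2 (size)), GEN_INT (0)) asks for alignment to the
next power-of-two boundary that covers the measured loop size, and the
max skip of 0 means the loop is aligned "without considering the max
skip", per the function's head comment.  A standalone sketch of the
size-to-alignment computation (hypothetical helper name, not GCC code):

    #include <stdio.h>

    /* Smallest L such that (1 << L) >= x, i.e. what ceil_log2 (size)
       yields for the alignment request.  */
    static unsigned
    ceil_log2_sketch (unsigned long long x)
    {
      unsigned l = 0;
      while ((1ULL << l) < x)
        l++;
      return l;
    }

    int
    main (void)
    {
      /* A loop body of 24 bytes is aligned to 1 << 5 = 32 bytes, so
         it cannot straddle a 32-byte boundary and stays within one
         cache line.  */
      printf ("%u\n", ceil_log2_sketch (24)); /* prints 5 */
      return 0;
    }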
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766f7b9..e500f15c997 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
  INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
+  /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area.
+     PR116174.  */
+  INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
 
   INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
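
A note on the registration above: INSERT_PASS_BEFORE places the new
pass immediately before the reference pass, and the directives are
processed in file order, so listing pass_align_tight_loops after the
pass_insert_endbr_and_patchable_area line makes it run between that
pass and pass_shorten_branches, which is exactly the ordering the
comment (and PR116174) requires.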
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 46214a63974..36c7b1aed42 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
 
 extern bool ix86_has_no_direct_extern_access;
 
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 6f89891d3cb..288c69467d6 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load ()
     }
 }
 
-/* When a hot loop can be fit into one cacheline,
-   force align the loop without considering the max skip.  */
-static void
-ix86_align_loops ()
-{
-  basic_block bb;
-
-  /* Don't do this when we don't know cache line size.  */
-  if (ix86_cost->prefetch_block == 0)
-    return;
-
-  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
-  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
-  FOR_EACH_BB_FN (bb, cfun)
-    {
-      rtx_insn *label = BB_HEAD (bb);
-      bool has_fallthru = 0;
-      edge e;
-      edge_iterator ei;
-
-      if (!LABEL_P (label))
-       continue;
-
-      profile_count fallthru_count = profile_count::zero ();
-      profile_count branch_count = profile_count::zero ();
-
-      FOR_EACH_EDGE (e, ei, bb->preds)
-       {
-         if (e->flags & EDGE_FALLTHRU)
-           has_fallthru = 1, fallthru_count += e->count ();
-         else
-           branch_count += e->count ();
-       }
-
-      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
-       continue;
-
-      if (bb->loop_father
-         && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
-         && (has_fallthru
-             ? (!(single_succ_p (bb)
-                  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
-                && optimize_bb_for_speed_p (bb)
-                && branch_count + fallthru_count > count_threshold
-                && (branch_count > fallthru_count * param_align_loop_iterations))
-             /* In case there's no fallthru for the loop.
-                Nops inserted won't be executed.  */
-             : (branch_count > count_threshold
-                || (bb->count > bb->prev_bb->count * 10
-                    && (bb->prev_bb->count
-                        <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
-       {
-         rtx_insn* insn, *end_insn;
-         HOST_WIDE_INT size = 0;
-         bool padding_p = true;
-         basic_block tbb = bb;
-         unsigned cond_branch_num = 0;
-         bool detect_tight_loop_p = false;
-
-         for (unsigned int i = 0; i != bb->loop_father->num_nodes;
-              i++, tbb = tbb->next_bb)
-           {
-             /* Only handle continuous cfg layout. */
-             if (bb->loop_father != tbb->loop_father)
-               {
-                 padding_p = false;
-                 break;
-               }
-
-             FOR_BB_INSNS (tbb, insn)
-               {
-                 if (!NONDEBUG_INSN_P (insn))
-                   continue;
-                 size += ix86_min_insn_size (insn);
-
-                 /* We don't know size of inline asm.
-                    Don't align loop for call.  */
-                 if (asm_noperands (PATTERN (insn)) >= 0
-                     || CALL_P (insn))
-                   {
-                     size = -1;
-                     break;
-                   }
-               }
-
-             if (size == -1 || size > ix86_cost->prefetch_block)
-               {
-                 padding_p = false;
-                 break;
-               }
-
-             FOR_EACH_EDGE (e, ei, tbb->succs)
-               {
-                 /* It could be part of the loop.  */
-                 if (e->dest == bb)
-                   {
-                     detect_tight_loop_p = true;
-                     break;
-                   }
-               }
-
-             if (detect_tight_loop_p)
-               break;
-
-             end_insn = BB_END (tbb);
-             if (JUMP_P (end_insn))
-               {
-                 /* For decoded icache:
-                    1. Up to two branches are allowed per Way.
-                    2. A non-conditional branch is the last micro-op in a Way.
-                 */
-                 if (onlyjump_p (end_insn)
-                     && (any_uncondjump_p (end_insn)
-                         || single_succ_p (tbb)))
-                   {
-                     padding_p = false;
-                     break;
-                   }
-                 else if (++cond_branch_num >= 2)
-                   {
-                     padding_p = false;
-                     break;
-                   }
-               }
-
-           }
-
-         if (padding_p && detect_tight_loop_p)
-           {
-             emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
-                                                   GEN_INT (0)), label);
-             /* End of function.  */
-             if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
-               break;
-             /* Skip bb which already fits into one cacheline.  */
-             bb = tbb;
-           }
-       }
-    }
-
-  loop_optimizer_finalize ();
-  free_dominance_info (CDI_DOMINATORS);
-}
-
 /* Implement machine specific optimizations.  We implement padding of returns
    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
 static void
@@ -23611,8 +23467,6 @@ ix86_reorg (void)
 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
       if (TARGET_FOUR_JUMP_LIMIT)
        ix86_avoid_jump_mispredicts ();
-
-      ix86_align_loops ();
 #endif
     }
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c
new file mode 100644
index 00000000000..8877d0b51af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116174.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -fcf-protection=branch" } */
+
+char *
+foo (char *dest, const char *src)
+{
+  while ((*dest++ = *src++) != '\0')
+    /* nothing */;
+  return --dest;
+}
+
+/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */

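
Finally, the decoded-icache constraint quoted in ix86_align_loops (up
to two branches per Way, and a non-conditional branch is the last
micro-op in a Way) boils down to a simple rejection test on the loop's
basic blocks.  A standalone sketch (hypothetical types and names, not
GCC code; the real pass also rejects single-successor only-jumps):

    #include <stdbool.h>
    #include <stdio.h>

    enum branch_kind { BR_NONE, BR_COND, BR_UNCOND };

    /* Reject a candidate loop when a block ends in an unconditional
       jump, or when a second conditional branch is seen, mirroring
       the padding_p bail-outs in ix86_align_loops.  */
    static bool
    loop_alignable_p (const enum branch_kind *block_ends, int n_blocks)
    {
      int cond_branch_num = 0;
      for (int i = 0; i < n_blocks; i++)
        {
          if (block_ends[i] == BR_UNCOND)
            return false; /* an unconditional branch ends the Way */
          if (block_ends[i] == BR_COND && ++cond_branch_num >= 2)
            return false; /* over the two-branches-per-Way limit */
        }
      return true;
    }

    int
    main (void)
    {
      enum branch_kind one_cond[] = { BR_NONE, BR_COND };
      enum branch_kind two_cond[] = { BR_COND, BR_COND };
      printf ("%d %d\n", loop_alignable_p (one_cond, 2),
              loop_alignable_p (two_cond, 2)); /* prints 1 0 */
      return 0;
    }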