Hi all:
    This patch adds a -malign-tight-loops flag to control
pass_align_tight_loops.
    The motivation is that pass_align_tight_loops may cause performance
regressions in nested loops.

    The example code is as follows:

    #define ITER 20000
    #define ITER_O 10

    int i, j, k;
    int array[ITER];

    void loop()
    {
      int i;
      for (k = 0; k < ITER_O; k++)
        for (j = 0; j < ITER; j++)
          for (i = 0; i < ITER; i++)
            {
              array[i] += j;
              array[i] += i;
              array[i] += 2 * j;
              array[i] += 2 * i;
            }
    }
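
    A minimal driver along these lines (illustrative, not part of the
original example) makes loop.c link and run for benchmarking:

    int main(void)
    {
      loop();
      return 0;  /* array is global, so the stores in loop() are kept */
    }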

    When I compile it with gcc -O1 loop.c, the output assembly is as
follows.  It is not optimal, because too many nops are inserted in the
outer loop.

0000000000400540 <loop>:
  400540:       48 83 ec 08             sub    $0x8,%rsp
  400544:       bf 0a 00 00 00          mov    $0xa,%edi
  400549:       b9 00 00 00 00          mov    $0x0,%ecx
  40054e:       8d 34 09                lea    (%rcx,%rcx,1),%esi
  400551:       b8 00 00 00 00          mov    $0x0,%eax
  400556:       66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
  40055d:       00 00 00 00
  400561:       66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
  400568:       00 00 00 00
  40056c:       66 66 2e 0f 1f 84 00    data16 nopw %cs:0x0(%rax,%rax,1)
  400573:       00 00 00 00
  400577:       66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
  40057e:       00 00
  400580:       89 ca                   mov    %ecx,%edx
  400582:       03 14 85 60 10 60 00    add    0x601060(,%rax,4),%edx
  400589:       01 c2                   add    %eax,%edx
  40058b:       01 f2                   add    %esi,%edx
  40058d:       8d 14 42                lea    (%rdx,%rax,2),%edx
  400590:       89 14 85 60 10 60 00    mov    %edx,0x601060(,%rax,4)
  400597:       48 83 c0 01             add    $0x1,%rax
  40059b:       48 3d 20 4e 00 00       cmp    $0x4e20,%rax
  4005a1:       75 dd                   jne    400580 <loop+0x40>

   I benchmarked this program on an Intel Xeon and found that the
optimization may cause a 40% performance regression
(6.6B cycles vs. 9.3B cycles).
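
   For reference, one way to measure the cycle counts is with perf (a
sketch, not necessarily the exact setup used here):

    gcc -O1 loop.c -o loop
    perf stat ./loop     # cycles is among the events perf stat reports by default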
   So I propose adding a -malign-tight-loops flag to control the tight
loop alignment pass; we could disable this optimization by default to
avoid such regressions.
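
   With the patch applied, the pass is off by default and can be
re-enabled per compilation where it helps, for example:

    gcc -O1 -malign-tight-loops loop.c      # opt back in to the pass
    gcc -O1 -mno-align-tight-loops loop.c   # the default (pass disabled)
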
   Bootstrapped on x86_64.
   Ok for trunk?

BR
Mayshao

gcc/ChangeLog:

        * config/i386/i386-features.cc (pass_align_tight_loops::gate):
        Gate the pass on the new ix86_align_tight_loops flag.
        * config/i386/i386.opt (malign-tight-loops): New option.
        * doc/invoke.texi (-malign-tight-loops): Document.
---
 gcc/config/i386/i386-features.cc | 4 +++-
 gcc/config/i386/i386.opt         | 4 ++++
 gcc/doc/invoke.texi              | 7 ++++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e2e85212a4f..f9546e00b07 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3620,7 +3620,9 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
     {
-      return optimize && optimize_function_for_speed_p (cfun);
+      return ix86_align_tight_loops
+             && optimize
+             && optimize_function_for_speed_p (cfun);
     }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 64c295d344c..ec41de192bc 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1266,6 +1266,10 @@ mlam=
 Target RejectNegative Joined Enum(lam_type) Var(ix86_lam_type) Init(lam_none)
 -mlam=[none|u48|u57] Instrument meta data position in user data pointers.
 
+malign-tight-loops
+Target Var(ix86_align_tight_loops) Init(0) Optimization
+Enable alignment of tight loops.
+
 Enum
 Name(lam_type) Type(enum lam_type) UnknownError(unknown lam type %qs)
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 07920e07b4d..9ec1e1f0095 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1510,7 +1510,7 @@ See RS/6000 and PowerPC Options.
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice}
 -mindirect-branch-register -mharden-sls=@var{choice}
 -mindirect-branch-cs-prefix -mneeded -mno-direct-extern-access
--munroll-only-small-loops -mlam=@var{choice}}
+-munroll-only-small-loops -mlam=@var{choice} -malign-tight-loops}
 
 @emph{x86 Windows Options}
 
 LAM(linear-address masking) allows special bits in the pointer to be used
 for metadata. The default is @samp{none}. With @samp{u48}, pointer bits in
 positions 62:48 can be used for metadata; With @samp{u57}, pointer bits in
 positions 62:57 can be used for metadata.
+
+@opindex malign-tight-loops
+@opindex mno-align-tight-loops
+@item -malign-tight-loops
+Control the tight loop alignment optimization.  This optimization is disabled by default.
 @end table
 
 @node x86 Windows Options
-- 
2.27.0
