https://gcc.gnu.org/g:c84be624e079cd748df93a3dc0b5168865fefee9

commit r15-7811-gc84be624e079cd748df93a3dc0b5168865fefee9
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Mon Mar 3 19:12:20 2025 +0100

    Make ix86_macro_fusion_pair_p and ix86_fuse_mov_alu_p match current CPUs
    
    The current implementation of the fusion predicates misses some common
    fusion cases on zen and more recent cores.  I added knobs for the
    individual conditionals we test.
    
     1) I split checks for fusing ALU with conditional operands when the ALU
     has a memory operand.  This seems to be supported by zen3+ and by
     tigerlake and cooperlake (according to Agner Fog's manual)
    
     2) znver4 and 5 support fusion of ALU and conditional even if ALU has
        memory and immediate operands.
        This seems to be relatively important, enabling 25% more fusions on
        gcc bootstrap.
    
     3) no CPU supports fusing when ALU contains IP relative memory
        references.  I added a separate knob so we do not forget about this if
        this gets supported later.
    
    The patch does not solve the limitation of sched that fuse pairs must be
    adjacent on input and the first operation must be single-set.  Fixing
    single-set is easy (I have a separate patch for this); for non-adjacent
    pairs we need bigger surgery.
    
    To verify what the CPU really does I made a simple test script.
    
    jh@ryzen3:~> cat fuse-test.c
            int b;
            const int z = 0;
            const int o = 1;
            int
    main()
    {
            int a = 1000000000;
            int b;
            int z = 0;
            int o = 1;
            asm volatile ("\n"
    ".L1234:\n"
            "nop\n"
            "subl   %3, %0\n"
    
            "movl %0, %1\n"
            "cmpl     %2, %1\n"
            "movl %0, %1\n"
            "test %1, %1\n"
    
            "nop\n"
            "jne    .L1234":"=a"(a),
            "=m"(b)
            "=r"(b)
            :
            "m"(z),
            "m"(o),
            "i"(0),
            "i"(1),
            "0"(a)
                    );
    }
    jh@ryzen3:~> cat fuse-test.sh
    EVENT=ex_ret_fused_instr
    dotest()
    {
    gcc -O2  fuse-test.c $* -o fuse-cmp-imm-mem-nofuse
    perf stat -e $EVENT ./fuse-cmp-imm-mem-nofuse  2>&1 | grep $EVENT
    gcc -O2 fuse-test.c -DFUSE $* -o fuse-cmp-imm-mem-fuse
    perf stat  -e $EVENT ./fuse-cmp-imm-mem-fuse 2>&1 | grep $EVENT
    }
    
    echo ALU with immediate
    dotest
    echo ALU with memory
    dotest -D MEM
    echo ALU with IP relative memory
    dotest -D MEM -D IPRELATIVE
    echo CMP with immediate
    dotest -D CMP
    echo CMP with memory
    dotest -D CMP -D MEM
    echo CMP with memory and immediate
    dotest -D CMP -D MEMIMM
    echo CMP with IP relative memory
    dotest -D CMP -D MEM -D IPRELATIVE
    echo TEST
    dotest -D TEST
    
    On zen5 I get:
    ALU with immediate
                20,345      ex_ret_fused_instr:u
         1,000,020,278      ex_ret_fused_instr:u
    ALU with memory
                20,367      ex_ret_fused_instr:u
         1,000,020,290      ex_ret_fused_instr:u
    ALU with IP relative memory
                20,395      ex_ret_fused_instr:u
                20,403      ex_ret_fused_instr:u
    CMP with immediate
                20,369      ex_ret_fused_instr:u
         1,000,020,301      ex_ret_fused_instr:u
    CMP with memory
                20,314      ex_ret_fused_instr:u
         1,000,020,341      ex_ret_fused_instr:u
    CMP with memory and immediate
                20,372      ex_ret_fused_instr:u
         1,000,020,266      ex_ret_fused_instr:u
    CMP with IP relative memory
                20,382      ex_ret_fused_instr:u
                20,369      ex_ret_fused_instr:u
    TEST
                20,346      ex_ret_fused_instr:u
         1,000,020,301      ex_ret_fused_instr:u
    
    IP relative memory seems to not be documented.
    
    On zen3/4 I get:
    
    ALU with immediate
                20,263      ex_ret_fused_instr:u
         1,000,020,051      ex_ret_fused_instr:u
    ALU with memory
                20,255      ex_ret_fused_instr:u
         1,000,020,056      ex_ret_fused_instr:u
    ALU with IP relative memory
                20,253      ex_ret_fused_instr:u
                20,266      ex_ret_fused_instr:u
    CMP with immediate
                20,264      ex_ret_fused_instr:u
         1,000,020,052      ex_ret_fused_instr:u
    CMP with memory
                20,253      ex_ret_fused_instr:u
         1,000,019,794      ex_ret_fused_instr:u
    CMP with memory and immediate
                20,260      ex_ret_fused_instr:u
                20,264      ex_ret_fused_instr:u
    CMP with IP relative memory
                20,258      ex_ret_fused_instr:u
                20,256      ex_ret_fused_instr:u
    TEST
                20,261      ex_ret_fused_instr:u
         1,000,020,048      ex_ret_fused_instr:u
    
    zen1 and 2 get:
    
    ALU with immediate
                21,610      ex_ret_fus_brnch_inst:u
                21,697      ex_ret_fus_brnch_inst:u
    ALU with memory
                21,479      ex_ret_fus_brnch_inst:u
                21,747      ex_ret_fus_brnch_inst:u
    ALU with IP relative memory
                21,623      ex_ret_fus_brnch_inst:u
                21,684      ex_ret_fus_brnch_inst:u
    CMP with immediate
                21,708      ex_ret_fus_brnch_inst:u
         1,000,021,288      ex_ret_fus_brnch_inst:u
    CMP with memory
                21,689      ex_ret_fus_brnch_inst:u
         1,000,004,270      ex_ret_fus_brnch_inst:u
    CMP with memory and immediate
                21,604      ex_ret_fus_brnch_inst:u
                21,671      ex_ret_fus_brnch_inst:u
    CMP with IP relative memory
                21,589      ex_ret_fus_brnch_inst:u
                21,602      ex_ret_fus_brnch_inst:u
    TEST
                21,600      ex_ret_fus_brnch_inst:u
         1,000,021,233      ex_ret_fus_brnch_inst:u
    
    I tested the patch on zen3 and zen5 and spec2k17 and it seems neutral,
    however the number of fusions does go up.
    
    Bootstrapped/regtested x86_64-linux, I plan to commit it tomorrow.
    
    Honza
    
    gcc/ChangeLog:
    
            * config/i386/i386.h (TARGET_FUSE_ALU_AND_BRANCH_MEM): New macro.
            (TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM): New macro.
            (TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE): New macro.
            * config/i386/x86-tune-sched.cc (ix86_fuse_mov_alu_p): Support
            non-single-set.
            (ix86_macro_fusion_pair_p): Allow ALU which only clobbers;
            be more careful about immediates; check
            TARGET_FUSE_ALU_AND_BRANCH_MEM,
            TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM,
            TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE;
            verify that we never use unsigned checks with inc/dec.
            * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): New tune.
            (X86_TUNE_FUSE_ALU_AND_BRANCH_MEM): New tune.
            (X86_TUNE_FUSE_ALU_AND_BRANCH_MEM_IMM): New tune.
            (X86_TUNE_FUSE_ALU_AND_BRANCH_RIP_RELATIVE): New tune.

Diff:
---
 gcc/config/i386/i386.h            |  6 ++++
 gcc/config/i386/x86-tune-sched.cc | 72 ++++++++++++++++++++++++++++++---------
 gcc/config/i386/x86-tune.def      | 18 +++++++++-
 3 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 56ef11a58bbc..2696bfb3a81e 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -432,6 +432,12 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
        ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
        ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_ALU_AND_BRANCH_MEM \
+       ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH_MEM]
+#define TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM \
+       ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH_MEM_IMM]
+#define TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE\
+       ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH_RIP_RELATIVE]
 #define TARGET_FUSE_MOV_AND_ALU \
        ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index a59d7c229c2c..a51764e078c8 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -32,6 +32,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "insn-attr.h"
 #include "insn-opinit.h"
 #include "recog.h"
+#include "tm-constrs.h"
 
 /* Return the maximum number of instructions a cpu can issue.  */
 
@@ -571,6 +572,9 @@ ix86_macro_fusion_p ()
   return TARGET_FUSE_CMP_AND_BRANCH;
 }
 
+/* Check whether MOV is a reg-reg move and ALU is an
+   ALU operation that allows macro-op fusion.  */
+
 static bool
 ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
 {
@@ -593,6 +597,16 @@ ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
   rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
   if (GET_CODE (set2) != SET)
     return false;
+  /* If this is instruction setting both compare and normal
+     register, the first set always sets flags, while
+     second set writes to the output operan.  Pick
+     the second set.  */
+  if (GET_CODE (SET_SRC (set2)) == COMPARE)
+    {
+      set2 = XVECEXP (PATTERN (alu), 0, 1);
+      if (GET_CODE (set2) != SET)
+       return false;
+    }
   /* Match one of:
      ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
      We also may add insn attribute to handle some of sporadic
@@ -635,10 +649,11 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
   if (TARGET_FUSE_MOV_AND_ALU
       && ix86_fuse_mov_alu_p (condgen, condjmp))
     return true;
-  rtx src, dest;
+  rtx src, imm = NULL_RTX;
   enum rtx_code ccode;
   rtx compare_set = NULL_RTX, test_if, cond;
   rtx alu_set = NULL_RTX, addr = NULL_RTX;
+  rtx alu_clobber = NULL_RTX;
   enum attr_type condgen_type;
 
   if (!any_condjump_p (condjmp))
@@ -664,6 +679,9 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
       alu_set = XVECEXP (PATTERN (condgen), 0, 1);
       goto handle_stack_protect_test;
     }
+  /* ??? zen5 can fuse cmp, test, sub, add, inc, dec, or, and xor.
+     Cores can not fuse or and xor which will pass the test below
+     since type is ALU.  */
   else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
@@ -687,6 +705,11 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
            else
              alu_set = XVECEXP (pat, 0, i);
          }
+       /* We also possibly generated ALU instruction only to set
+          flags.  In this case there will be clobber.  */
+       else if (GET_CODE (XVECEXP (pat, 0, i)) == CLOBBER
+           && GENERAL_REG_P (XEXP (XVECEXP (pat, 0, i), 0)))
+         alu_clobber = XVECEXP (pat, 0, i);
     }
   if (compare_set == NULL_RTX)
     return false;
@@ -694,19 +717,30 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
   if (GET_CODE (src) != COMPARE)
     return false;
 
-  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
-     supported.  */
-  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
-      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
-    return false;
-
-  /* No fusion for RIP-relative address.  */
+  /* Check for memory operand.  */
   if (MEM_P (XEXP (src, 0)))
     addr = XEXP (XEXP (src, 0), 0);
   else if (MEM_P (XEXP (src, 1)))
     addr = XEXP (XEXP (src, 1), 0);
+  /* Some CPUs, i.e. tigerlake and cooperlake does not fuse
+     ALU with memory operand.  */
+  if (addr && !TARGET_FUSE_ALU_AND_BRANCH_MEM)
+    return false;
+  if (CONST_INT_P (XEXP (src, 0)))
+    imm = XEXP (src, 0);
+  else if (CONST_INT_P (XEXP (src, 1)))
+    imm = XEXP (src, 1);
+  /* Check that the instruction really has immediate.
+     In particular compare with 0 is done using test with no immediate.  */
+  if (imm && !get_attr_length_immediate (condgen))
+    imm = NULL;
+  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+     supported.   */
+  if (addr && imm && !TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM)
+    return false;
 
-  if (addr)
+  /* No fusion for RIP-relative address.   */
+  if (addr && !TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE)
     {
       ix86_address parts;
       int ok = ix86_decompose_address (addr, &parts);
@@ -715,6 +749,12 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
       if (ix86_rip_relative_addr_p (&parts))
        return false;
     }
+  /* Znver5 supports fussion fusion with their reg/reg, reg/imm and
+     reg/mem forms. They are also supported when the instruction has an
+     immediate and displacement that meets the criteria of 4 byte displacement
+     and 2 byte immediate or the case of 2 byte displacement and 4 byte
+     immediate.  We do not know the displacement size, so we ignore this
+     limitation.  */
 
  handle_stack_protect_test:
   test_if = SET_SRC (pc_set (condjmp));
@@ -730,19 +770,19 @@ ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn 
*condjmp)
     return true;
 
   /* The following is the case that macro-fusion for alu + jmp.  */
-  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
+  if (!TARGET_FUSE_ALU_AND_BRANCH || (!alu_set && !alu_clobber))
     return false;
 
   /* No fusion for alu op with memory destination operand.  */
-  dest = SET_DEST (alu_set);
-  if (MEM_P (dest))
+  if (alu_set && MEM_P (SET_DEST (alu_set)))
     return false;
 
+
   /* Macro-fusion for inc/dec + unsigned conditional jump is not
-     supported.  */
-  if (condgen_type == TYPE_INCDEC
-      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
-    return false;
+     supported on some CPUs while supported on others (znver5 and core_avx512).
+     We however never generate it, so we do not need a specific tune for it.  
*/
+  gcc_checking_assert (!(condgen_type == TYPE_INCDEC
+                      && (ccode == GEU || ccode == GTU || ccode == LEU || 
ccode == LTU)));
 
   return true;
 }
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0bdad7234a6a..b6e39f642e88 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -149,7 +149,7 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, 
"fuse_cmp_and_branch_soflags",
    TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND,
    There is also limitation for immediate and displacement supported.  */
 DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
-         m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)
+         m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | 
m_ZNVER4 | m_ZNVER5)
 
 /* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
    and the destination is used by alu.  alu must be one of
@@ -157,6 +157,22 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, 
"fuse_alu_and_branch",
 DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
         m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
 
+/* X86_TUNE_FUSE_AND_BRANCH_MEM: Fuse alu with a subsequent conditional
+   jump instruction when alu contains memory operand.
+   TODO: Not suported by TIGERLAKE and COPERLAKE, so m_CORE_AVX2 is wrong.  */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH_MEM, "fuse_alu_and_branch_mem",
+         m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER3 | 
m_ZNVER4 | m_ZNVER5)
+
+/* X86_TUNE_FUSE_AND_BRANCH_MEM_IMM: Fuse alu with a subsequent conditional
+   jump instruction when alu contains both immediate and displacement.  */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH_MEM_IMM, "fuse_alu_and_branch_mem_imm",
+         m_GENERIC | m_ZNVER4 | m_ZNVER5)
+
+/* X86_TUNE_FUSE_AND_BRANCH_RIP_RELATIVE: Fuse alu with a subsequent
+   conditional jump instruction when alu contains IP relative address.  */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH_RIP_RELATIVE,
+         "fuse_alu_and_branch_rip_relative", 0)
+
 /*****************************************************************************/
 /* Function prologue, epilogue and function calling sequences.               */
 /*****************************************************************************/

Reply via email to