https://gcc.gnu.org/g:8c4a00f9a48f1b2af10448c9f2058b44b8cb7234

commit r15-7813-g8c4a00f9a48f1b2af10448c9f2058b44b8cb7234
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Tue Mar 4 16:22:01 2025 +0100

    Break false dependency chain on Zen5
    
    Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk
    instructions.  Those can be tested by the following benchmark
    
    jh@shroud:~> cat ee.c
    int
    main()
    {
           int a = 10;
           int b = 0;
           for (int i = 0; i < 1000000000; i++)
           {
                   asm volatile ("xor %0, %0": "=r" (b));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
                   asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a));
           }
           return 0;
    }
    jh@shroud:~> cat bmk.sh
    gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" 
-O2 ; time ./a.out
    jh@shroud:~> sh bmk.sh tzcnt
    
    real    0m0.886s
    user    0m0.886s
    sys     0m0.000s
    
    real    0m0.886s
    user    0m0.886s
    sys     0m0.000s
    
    jh@shroud:~> sh bmk.sh blsi
    
    real    0m0.979s
    user    0m0.979s
    sys     0m0.000s
    
    real    0m2.418s
    user    0m2.418s
    sys     0m0.000s
    
    jh@shroud:~> sh bmk.sh blsr
    
    real    0m0.986s
    user    0m0.986s
    sys     0m0.000s
    
    real    0m2.422s
    user    0m2.421s
    sys     0m0.000s
    jh@shroud:~> sh bmk.sh blsmsk
    
    real    0m0.973s
    user    0m0.973s
    sys     0m0.000s
    
    real    0m2.422s
    user    0m2.422s
    sys     0m0.000s
    
    We already have runable that controls tzcnt together with lzcnt and popcnt.
    Since it seems that only tzcnt is affected I added new tunable to control 
tzcnt
    only.  I also added splitters for blsi/blsr/blsmsk implemented analogously 
to
    existing splitter for lzcnt.
    
    The patch is neutral on SPEC. We produce blsi and blsr in some internal 
loops, but
    they usually have same destination as source. However it is good to break 
the
    dependency chain to avoid patogolical cases and it is quite cheap overall, 
so I
    think we want to enable this for generic.  I will send followup patch for 
this.
    
    Bootstrapped/regtested x86_64-linux, will commit it shortly.
    
    gcc/ChangeLog:
    
            * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro.
            (TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro.
            * config/i386/i386.md (*bmi_blsi_<mode>): Add splitter for false
            dependency.
            (*bmi_blsi_<mode>_ccno): Add splitter for false dependency.
            (*bmi_blsi_<mode>_falsedep): New pattern.
            (*bmi_blsmsk_<mode>): Add splitter for false dependency.
            (*bmi_blsmsk_<mode>_falsedep): New pattern.
            (*bmi_blsr_<mode>): Add splitter for false dependency.
            (*bmi_blsr_<mode>_cmp): Add splitter for false dependency
            (*bmi_blsr_<mode>_cmp_falsedep): New pattern.
            * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): 
New tune.
            (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/blsi.c: New test.
            * gcc.target/i386/blsmsk.c: New test.
            * gcc.target/i386/blsr.c: New test.

Diff:
---
 gcc/config/i386/i386.h                 |   4 +
 gcc/config/i386/i386.md                | 168 +++++++++++++++++++++++++++++++--
 gcc/config/i386/x86-tune.def           |  10 ++
 gcc/testsuite/gcc.target/i386/blsi.c   |  26 +++++
 gcc/testsuite/gcc.target/i386/blsmsk.c |   9 ++
 gcc/testsuite/gcc.target/i386/blsr.c   |  26 +++++
 6 files changed, 233 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 2696bfb3a81e..ce29c272bc0b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -461,6 +461,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
     ix86_tune_features[X86_TUNE_ADJUST_UNROLL]
 #define TARGET_AVOID_FALSE_DEP_FOR_BMI \
        ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI]
+#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT]
+#define TARGET_AVOID_FALSE_DEP_FOR_BLS \
+       ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS]
 #define TARGET_ONE_IF_CONV_INSN \
        ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN]
 #define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE]
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 8575fbf40fed..b1cd52382a80 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -20993,7 +20993,8 @@
        (ctz:SWI48 (match_dup 1)))]
   "TARGET_BMI"
   "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}";
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21060,7 +21061,8 @@
   return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21115,7 +21117,8 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI && TARGET_64BIT"
   "tzcnt{l}\t{%1, %k0|%k0, %1}"
-  "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+  "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21166,7 +21169,8 @@
   return "bsf{l}\t{%1, %k0|%k0, %1}";
 }
   "(TARGET_BMI || TARGET_CPU_P (GENERIC))
-   && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed
+   && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT)
+   && epilogue_completed
    && optimize_function_for_speed_p (cfun)
    && !reg_mentioned_p (operands[0], operands[1])"
   [(parallel
@@ -21747,7 +21751,7 @@
    (set_attr "btver2_decode" "direct, double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsi_<mode>"
+(define_insn_and_split "*bmi_blsi_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (neg:SWI48
@@ -21756,6 +21760,20 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsi\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCNO FLAGS_REG)
+      (compare:CCNO
+        (and:SWI48
+          (neg:SWI48 (match_dup 1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
@@ -21775,6 +21793,51 @@
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+(define_split
+  [(set (match_operand 3 "flags_reg_operand")
+       (match_operator 4 "compare_operator"
+         [(and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand"))
+           (match_dup 1))
+          (const_int 0)]))
+   (set (match_operand:SWI48 0 "register_operand")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))]
+  "TARGET_BMI
+   && TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 3)
+      (match_op_dup 4
+        [(and:SWI48
+           (neg:SWI48 (match_dup 1))
+           (match_dup 1))
+         (const_int 0)]))
+     (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);")
+
+; False dependency happens when destination is only updated by blsi.
+(define_insn "*bmi_blsi_<mode>_falsedep"
+  [(set (reg FLAGS_REG)
+       (compare
+         (and:SWI48
+           (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm"))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_BMI && ix86_match_ccmode (insn, CCNOmode)"
+   "blsi\t{%1, %0|%0, %1}"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsi_<mode>_ccno"
   [(set (reg FLAGS_REG)
        (compare
@@ -21789,7 +21852,7 @@
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsmsk_<mode>"
+(define_insn_and_split "*bmi_blsmsk_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (xor:SWI48
           (plus:SWI48
@@ -21799,11 +21862,40 @@
    (clobber (reg:CC FLAGS_REG))]
   "TARGET_BMI"
   "blsmsk\t{%1, %0|%0, %1}"
+  "TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (match_dup 0)
+          (xor:SWI48
+            (plus:SWI48 (match_dup 1) (const_int -1))
+            (match_dup 1)))
+     (clobber (reg:CC FLAGS_REG))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by blmsk.
+(define_insn "*bmi_blsmsk_<mode>_falsedep"
+  [(set (match_operand:SWI48 0 "register_operand" "=r")
+        (xor:SWI48
+          (plus:SWI48
+            (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+            (const_int -1))
+          (match_dup 1)))
+   (clobber (reg:CC FLAGS_REG))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_BMI"
+  "blsmsk\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>"
+(define_insn_and_split "*bmi_blsr_<mode>"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
         (and:SWI48
           (plus:SWI48
@@ -21811,13 +21903,28 @@
             (const_int -1))
           (match_dup 1)))
    (clobber (reg:CC FLAGS_REG))]
-   "TARGET_BMI"
-   "blsr\t{%1, %0|%0, %1}"
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "*bmi_blsr_<mode>_cmp"
+(define_insn_and_split "*bmi_blsr_<mode>_cmp"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
          (and:SWI48
@@ -21832,12 +21939,53 @@
            (match_dup 1)
            (const_int -1))
          (match_dup 1)))]
+  "TARGET_BMI"
+  "blsr\t{%1, %0|%0, %1}"
+  "&& TARGET_AVOID_FALSE_DEP_FOR_BLS
+   && epilogue_completed
+   && optimize_function_for_speed_p (cfun)
+   && !reg_mentioned_p (operands[0], operands[1])"
+  [(parallel
+    [(set (reg:CCZ FLAGS_REG)
+      (compare:CCZ
+        (and:SWI48
+          (plus:SWI48 (match_dup 1) (const_int -1))
+          (match_dup 1))
+        (const_int 0)))
+     (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1))
+                                   (match_dup 1)))
+     (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "ix86_expand_clear (operands[0]);"
+  [(set_attr "type" "bitmanip")
+   (set_attr "btver2_decode" "double")
+   (set_attr "mode" "<MODE>")])
+
+; False dependency happens when destination is only updated by bslr.
+(define_insn "*bmi_blsr_<mode>_cmp_falsedep"
+  [(set (reg:CCZ FLAGS_REG)
+       (compare:CCZ
+         (and:SWI48
+           (plus:SWI48
+             (match_operand:SWI48 1 "nonimmediate_operand" "rm")
+             (const_int -1))
+           (match_dup 1))
+         (const_int 0)))
+   (set (match_operand:SWI48 0 "register_operand" "=r")
+       (and:SWI48
+         (plus:SWI48
+           (match_dup 1)
+           (const_int -1))
+         (match_dup 1)))
+   (unspec [(match_operand:SWI48 2 "register_operand" "0")]
+          UNSPEC_INSN_FALSE_DEP)]
    "TARGET_BMI"
    "blsr\t{%1, %0|%0, %1}"
   [(set_attr "type" "bitmanip")
    (set_attr "btver2_decode" "double")
    (set_attr "mode" "<MODE>")])
 
+;; No need for splitter of the false dependency, since the output is unused
+;; so it will not extend dependency chain.
 (define_insn "*bmi_blsr_<mode>_ccz"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b6e39f642e88..c857e769b60e 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -366,6 +366,16 @@ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, 
"avoid_false_dep_for_bmi",
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)
 
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT: Avoid false dependency
+   for tzcnt instruction (also included in X86_TUNE_AVOID_FALSE_DEP_FOR_BMI).  
*/
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT, "avoid_false_dep_for_tzcnt",
+         m_ZNVER5)
+
+/* X86_TUNE_AVOID_FALSE_DEP_FOR_BLS: Avoid false dependency
+   for blsi, blsr and blsmsk instructions.  */
+DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS, "avoid_false_dep_for_bls",
+         m_ZNVER5)
+
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
    on hardware capabilities. Bdver3 hardware has a loop buffer which makes
    unrolling small loop less important. For, such architectures we adjust
diff --git a/gcc/testsuite/gcc.target/i386/blsi.c 
b/gcc/testsuite/gcc.target/i386/blsi.c
new file mode 100644
index 000000000000..ee68a9d42c3d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsi.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & -a;
+}
+int
+test2(int a)
+{
+       if (a & -a)
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & -a;
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsi\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsmsk.c 
b/gcc/testsuite/gcc.target/i386/blsmsk.c
new file mode 100644
index 000000000000..9d15ac031805
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsmsk.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+int
+test(int a)
+{
+       return (a - 1)^a;
+}
+/* { dg-final { scan-assembler-times "blsmsk\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "xor" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/blsr.c 
b/gcc/testsuite/gcc.target/i386/blsr.c
new file mode 100644
index 000000000000..340ebb40dd0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/blsr.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=znver5" } */
+void bar ();
+int
+test(int a)
+{
+       return a & (a - 1);
+}
+int
+test2(int a)
+{
+       if (a & (a - 1))
+               bar ();
+}
+int
+test3(int a)
+{
+       int ret = a & (a - 1);
+       if (ret)
+               bar ();
+       return ret;
+}
+/* All three functions should produce bslr.
+   Only test and test3 needs xor to break false dependency.  */
+/* { dg-final { scan-assembler-times "blsr\[ \\t\]+" 3 } } */
+/* { dg-final { scan-assembler-times "xor" 2 } } */

Reply via email to