https://gcc.gnu.org/g:8c4a00f9a48f1b2af10448c9f2058b44b8cb7234
commit r15-7813-g8c4a00f9a48f1b2af10448c9f2058b44b8cb7234 Author: Jan Hubicka <hubi...@ucw.cz> Date: Tue Mar 4 16:22:01 2025 +0100 Break false dependency chain on Zen5 Zen5 on some variants has false dependency on tzcnt, blsi, blsr and blsmsk instructions. Those can be tested by the following benchmark jh@shroud:~> cat ee.c int main() { int a = 10; int b = 0; for (int i = 0; i < 1000000000; i++) { asm volatile ("xor %0, %0": "=r" (b)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); asm volatile (INST " %2, %0": "=r"(b): "0"(b),"r"(a)); } return 0; } jh@shroud:~> cat bmk.sh gcc ee.c -DBREAK -DINST=\"$1\" -O2 ; time ./a.out ; gcc ee.c -DINST=\"$1\" -O2 ; time ./a.out jh@shroud:~> sh bmk.sh tzcnt real 0m0.886s user 0m0.886s sys 0m0.000s real 0m0.886s user 0m0.886s sys 0m0.000s jh@shroud:~> sh bmk.sh blsi real 0m0.979s user 0m0.979s sys 0m0.000s real 0m2.418s user 0m2.418s sys 0m0.000s jh@shroud:~> sh bmk.sh blsr real 0m0.986s user 0m0.986s sys 0m0.000s real 0m2.422s user 0m2.421s sys 0m0.000s jh@shroud:~> sh bmk.sh blsmsk real 0m0.973s user 0m0.973s sys 0m0.000s real 0m2.422s user 0m2.422s sys 0m0.000s We already have runable that controls tzcnt together with lzcnt and popcnt. Since it seems that only tzcnt is affected I added new tunable to control tzcnt only. I also added splitters for blsi/blsr/blsmsk implemented analogously to existing splitter for lzcnt. The patch is neutral on SPEC. We produce blsi and blsr in some internal loops, but they usually have same destination as source. However it is good to break the dependency chain to avoid patogolical cases and it is quite cheap overall, so I think we want to enable this for generic. I will send followup patch for this. Bootstrapped/regtested x86_64-linux, will commit it shortly. gcc/ChangeLog: * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_TZCNT): New macro. (TARGET_AVOID_FALSE_DEP_FOR_BLS): New macro. * config/i386/i386.md (*bmi_blsi_<mode>): Add splitter for false dependency. (*bmi_blsi_<mode>_ccno): Add splitter for false dependency. (*bmi_blsi_<mode>_falsedep): New pattern. (*bmi_blsmsk_<mode>): Add splitter for false dependency. (*bmi_blsmsk_<mode>_falsedep): New pattern. (*bmi_blsr_<mode>): Add splitter for false dependency. (*bmi_blsr_<mode>_cmp): Add splitter for false dependency (*bmi_blsr_<mode>_cmp_falsedep): New pattern. * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT): New tune. (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS): New tune. gcc/testsuite/ChangeLog: * gcc.target/i386/blsi.c: New test. * gcc.target/i386/blsmsk.c: New test. * gcc.target/i386/blsr.c: New test. Diff: --- gcc/config/i386/i386.h | 4 + gcc/config/i386/i386.md | 168 +++++++++++++++++++++++++++++++-- gcc/config/i386/x86-tune.def | 10 ++ gcc/testsuite/gcc.target/i386/blsi.c | 26 +++++ gcc/testsuite/gcc.target/i386/blsmsk.c | 9 ++ gcc/testsuite/gcc.target/i386/blsr.c | 26 +++++ 6 files changed, 233 insertions(+), 10 deletions(-) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 2696bfb3a81e..ce29c272bc0b 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -461,6 +461,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_ADJUST_UNROLL] #define TARGET_AVOID_FALSE_DEP_FOR_BMI \ ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI] +#define TARGET_AVOID_FALSE_DEP_FOR_TZCNT \ + ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT] +#define TARGET_AVOID_FALSE_DEP_FOR_BLS \ + ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BLS] #define TARGET_ONE_IF_CONV_INSN \ ix86_tune_features[X86_TUNE_ONE_IF_CONV_INSN] #define TARGET_AVOID_MFENCE ix86_tune_features[X86_TUNE_AVOID_MFENCE] diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8575fbf40fed..b1cd52382a80 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -20993,7 +20993,8 @@ (ctz:SWI48 (match_dup 1)))] "TARGET_BMI" "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"; - "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21060,7 +21061,8 @@ return "bsf{<imodesuffix>}\t{%1, %0|%0, %1}"; } "(TARGET_BMI || TARGET_CPU_P (GENERIC)) - && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21115,7 +21117,8 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI && TARGET_64BIT" "tzcnt{l}\t{%1, %k0|%k0, %1}" - "&& TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + "&& (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21166,7 +21169,8 @@ return "bsf{l}\t{%1, %k0|%k0, %1}"; } "(TARGET_BMI || TARGET_CPU_P (GENERIC)) - && TARGET_AVOID_FALSE_DEP_FOR_BMI && epilogue_completed + && (TARGET_AVOID_FALSE_DEP_FOR_BMI || TARGET_AVOID_FALSE_DEP_FOR_TZCNT) + && epilogue_completed && optimize_function_for_speed_p (cfun) && !reg_mentioned_p (operands[0], operands[1])" [(parallel @@ -21747,7 +21751,7 @@ (set_attr "btver2_decode" "direct, double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsi_<mode>" +(define_insn_and_split "*bmi_blsi_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (and:SWI48 (neg:SWI48 @@ -21756,6 +21760,20 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI" "blsi\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCNO FLAGS_REG) + (compare:CCNO + (and:SWI48 + (neg:SWI48 (match_dup 1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) @@ -21775,6 +21793,51 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) +(define_split + [(set (match_operand 3 "flags_reg_operand") + (match_operator 4 "compare_operator" + [(and:SWI48 + (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand")) + (match_dup 1)) + (const_int 0)])) + (set (match_operand:SWI48 0 "register_operand") + (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1)))] + "TARGET_BMI + && TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 3) + (match_op_dup 4 + [(and:SWI48 + (neg:SWI48 (match_dup 1)) + (match_dup 1)) + (const_int 0)])) + (set (match_dup 0) (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);") + +; False dependency happens when destination is only updated by blsi. +(define_insn "*bmi_blsi_<mode>_falsedep" + [(set (reg FLAGS_REG) + (compare + (and:SWI48 + (neg:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")) + (match_dup 1)) + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 (neg:SWI48 (match_dup 1)) (match_dup 1))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_BMI && ix86_match_ccmode (insn, CCNOmode)" + "blsi\t{%1, %0|%0, %1}" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +;; No need for splitter of the false dependency, since the output is unused +;; so it will not extend dependency chain. (define_insn "*bmi_blsi_<mode>_ccno" [(set (reg FLAGS_REG) (compare @@ -21789,7 +21852,7 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsmsk_<mode>" +(define_insn_and_split "*bmi_blsmsk_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (xor:SWI48 (plus:SWI48 @@ -21799,11 +21862,40 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_BMI" "blsmsk\t{%1, %0|%0, %1}" + "TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (match_dup 0) + (xor:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG)) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +; False dependency happens when destination is only updated by blmsk. +(define_insn "*bmi_blsmsk_<mode>_falsedep" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (xor:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1))) + (clobber (reg:CC FLAGS_REG)) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] + "TARGET_BMI" + "blsmsk\t{%1, %0|%0, %1}" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsr_<mode>" +(define_insn_and_split "*bmi_blsr_<mode>" [(set (match_operand:SWI48 0 "register_operand" "=r") (and:SWI48 (plus:SWI48 @@ -21811,13 +21903,28 @@ (const_int -1)) (match_dup 1))) (clobber (reg:CC FLAGS_REG))] - "TARGET_BMI" - "blsr\t{%1, %0|%0, %1}" + "TARGET_BMI" + "blsr\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "*bmi_blsr_<mode>_cmp" +(define_insn_and_split "*bmi_blsr_<mode>_cmp" [(set (reg:CCZ FLAGS_REG) (compare:CCZ (and:SWI48 @@ -21832,12 +21939,53 @@ (match_dup 1) (const_int -1)) (match_dup 1)))] + "TARGET_BMI" + "blsr\t{%1, %0|%0, %1}" + "&& TARGET_AVOID_FALSE_DEP_FOR_BLS + && epilogue_completed + && optimize_function_for_speed_p (cfun) + && !reg_mentioned_p (operands[0], operands[1])" + [(parallel + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_dup 0) (and:SWI48 (plus:SWI48 (match_dup 1) (const_int -1)) + (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])] + "ix86_expand_clear (operands[0]);" + [(set_attr "type" "bitmanip") + (set_attr "btver2_decode" "double") + (set_attr "mode" "<MODE>")]) + +; False dependency happens when destination is only updated by bslr. +(define_insn "*bmi_blsr_<mode>_cmp_falsedep" + [(set (reg:CCZ FLAGS_REG) + (compare:CCZ + (and:SWI48 + (plus:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm") + (const_int -1)) + (match_dup 1)) + (const_int 0))) + (set (match_operand:SWI48 0 "register_operand" "=r") + (and:SWI48 + (plus:SWI48 + (match_dup 1) + (const_int -1)) + (match_dup 1))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP)] "TARGET_BMI" "blsr\t{%1, %0|%0, %1}" [(set_attr "type" "bitmanip") (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) +;; No need for splitter of the false dependency, since the output is unused +;; so it will not extend dependency chain. (define_insn "*bmi_blsr_<mode>_ccz" [(set (reg:CCZ FLAGS_REG) (compare:CCZ diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index b6e39f642e88..c857e769b60e 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -366,6 +366,16 @@ DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE | m_ZHAOXIN | m_GENERIC) +/* X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT: Avoid false dependency + for tzcnt instruction (also included in X86_TUNE_AVOID_FALSE_DEP_FOR_BMI). */ +DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_TZCNT, "avoid_false_dep_for_tzcnt", + m_ZNVER5) + +/* X86_TUNE_AVOID_FALSE_DEP_FOR_BLS: Avoid false dependency + for blsi, blsr and blsmsk instructions. */ +DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BLS, "avoid_false_dep_for_bls", + m_ZNVER5) + /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based on hardware capabilities. Bdver3 hardware has a loop buffer which makes unrolling small loop less important. For, such architectures we adjust diff --git a/gcc/testsuite/gcc.target/i386/blsi.c b/gcc/testsuite/gcc.target/i386/blsi.c new file mode 100644 index 000000000000..ee68a9d42c3d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsi.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +void bar (); +int +test(int a) +{ + return a & -a; +} +int +test2(int a) +{ + if (a & -a) + bar (); +} +int +test3(int a) +{ + int ret = a & -a; + if (ret) + bar (); + return ret; +} +/* All three functions should produce bslr. + Only test and test3 needs xor to break false dependency. */ +/* { dg-final { scan-assembler-times "blsi\[ \\t\]+" 3 } } */ +/* { dg-final { scan-assembler-times "xor" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/blsmsk.c b/gcc/testsuite/gcc.target/i386/blsmsk.c new file mode 100644 index 000000000000..9d15ac031805 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsmsk.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +int +test(int a) +{ + return (a - 1)^a; +} +/* { dg-final { scan-assembler-times "blsmsk\[ \\t\]+" 1 } } */ +/* { dg-final { scan-assembler-times "xor" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/blsr.c b/gcc/testsuite/gcc.target/i386/blsr.c new file mode 100644 index 000000000000..340ebb40dd0d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/blsr.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=znver5" } */ +void bar (); +int +test(int a) +{ + return a & (a - 1); +} +int +test2(int a) +{ + if (a & (a - 1)) + bar (); +} +int +test3(int a) +{ + int ret = a & (a - 1); + if (ret) + bar (); + return ret; +} +/* All three functions should produce bslr. + Only test and test3 needs xor to break false dependency. */ +/* { dg-final { scan-assembler-times "blsr\[ \\t\]+" 3 } } */ +/* { dg-final { scan-assembler-times "xor" 2 } } */