Hello! Attached patch fixes the problem with false data dependency on output register for popcnt, lzcnt and tzcnt insns on sandybridge and haswell targets.
The new insn pattern shadows existing one, and after reload, the clearing isns is split out of the insn. This way the clearing insn can be scheduled by postreload scheduler. The new pattern takes care to avoid live registers, so the compiler is always able to clear output reg. The testcase from the PR, compiled with -O3 -march=corei7 improves on Ivybridge from: unsigned 209717360000 3.21002 sec 16.3329 GB/s uint64_t 209717360000 4.06517 sec 12.8971 GB/s to (-O3 -march=corei7 -mtune-ctrl=avoid_false_dep_for_bmi): unsigned 209717360000 3.14541 sec 16.6683 GB/s uint64_t 209717360000 2.3663 sec 22.1564 GB/s Due to high impact, the new tune flag is enabled by default for Intel tunes and generic: m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC 2014-08-16 Uros Bizjak <ubiz...@gmail.com> PR target/62011 * config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI): New tune flag. * config/i386/i386.h (TARGET_AVOID_FALSE_DEP_FOR_BMI): New define. * config/i386/i386.md (unspec) <UNSPEC_INSN_FALSE_DEP>: New unspec. (ffs<mode>2): Do not expand with tzcnt for TARGET_AVOID_FALSE_DEP_FOR_BMI. (ffssi2_no_cmove): Ditto. (*tzcnt<mode>_1): Disable for TARGET_AVOID_FALSE_DEP_FOR_BMI. (ctz<mode>2): New expander. (*ctz<mode>2_falsedep_1): New insn_and_split pattern. (*ctz<mode>2_falsedep): New insn. (*ctz<mode>2): Rename from ctz<mode>2. (clz<mode>2_lzcnt): New expander. (*clz<mode>2_lzcnt_falsedep_1): New insn_and_split pattern. (*clz<mode>2_lzcnt_falsedep): New insn. (*clz<mode>2): Rename from ctz<mode>2. (popcount<mode>2): New expander. (*popcount<mode>2_falsedep_1): New insn_and_split pattern. (*popcount<mode>2_falsedep): New insn. (*popcount<mode>2): Rename from ctz<mode>2. (*popcount<mode>2_cmp): Remove. (*popcountsi2_cmp_zext): Ditto. The patch was bootstrapped and regression tested on x86_64-pc-linux-gnu {,-m32} and will be committed to mainline SVN after a couple of days. The patch will be also backported to 4.9 branch. Uros.
Index: config/i386/i386.h =================================================================== --- config/i386/i386.h (revision 214059) +++ config/i386/i386.h (working copy) @@ -473,6 +473,8 @@ ix86_tune_features[X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS] #define TARGET_ADJUST_UNROLL \ ix86_tune_features[X86_TUNE_ADJUST_UNROLL] +#define TARGET_AVOID_FALSE_DEP_FOR_BMI \ + ix86_tune_features[X86_TUNE_AVOID_FALSE_DEP_FOR_BMI] /* Feature tests against the various architecture variations. */ enum ix86_arch_indices { Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 214059) +++ config/i386/i386.md (working copy) @@ -112,6 +112,7 @@ UNSPEC_XBEGIN_ABORT UNSPEC_STOS UNSPEC_PEEPSIB + UNSPEC_INSN_FALSE_DEP ;; For SSE/MMX support: UNSPEC_FIX_NOTRUNC @@ -12177,7 +12178,8 @@ DONE; } - flags_mode = TARGET_BMI ? CCCmode : CCZmode; + flags_mode + = (TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI) ? CCCmode : CCZmode; operands[2] = gen_reg_rtx (<MODE>mode); operands[3] = gen_rtx_REG (flags_mode, FLAGS_REG); @@ -12203,7 +12205,8 @@ (parallel [(set (match_dup 0) (plus:SI (match_dup 0) (const_int 1))) (clobber (reg:CC FLAGS_REG))])] { - enum machine_mode flags_mode = TARGET_BMI ? CCCmode : CCZmode; + enum machine_mode flags_mode + = (TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI) ? CCCmode : CCZmode; operands[3] = gen_lowpart (QImode, operands[2]); operands[4] = gen_rtx_REG (flags_mode, FLAGS_REG); @@ -12218,7 +12221,7 @@ (const_int 0))) (set (match_operand:SWI48 0 "register_operand" "=r") (ctz:SWI48 (match_dup 1)))] - "TARGET_BMI" + "TARGET_BMI && !TARGET_AVOID_FALSE_DEP_FOR_BMI" "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}" [(set_attr "type" "alu1") (set_attr "prefix_0f" "1") @@ -12239,7 +12242,52 @@ (set_attr "btver2_decode" "double") (set_attr "mode" "<MODE>")]) -(define_insn "ctz<mode>2" +(define_expand "ctz<mode>2" + [(parallel + [(set (match_operand:SWI248 0 "register_operand") + (ctz:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))])]) + +(define_insn_and_split "*ctz<mode>2_falsedep_1" + [(set (match_operand:SWI48 0 "register_operand" "=&r") + (ctz:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "(TARGET_BMI || TARGET_GENERIC) + && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 0) + (ctz:SWI48 (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);") + +(define_insn "*ctz<mode>2_falsedep" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (ctz:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "" +{ + if (TARGET_BMI) + return "tzcnt{<imodesuffix>}\t{%1, %0|%0, %1}"; + else if (TARGET_GENERIC) + /* tzcnt expands to 'rep bsf' and we can use it even if !TARGET_BMI. */ + return "rep%; bsf{<imodesuffix>}\t{%1, %0|%0, %1}"; + else + gcc_unreachable (); +} + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "prefix_rep" "1") + (set_attr "mode" "<MODE>")]) + +(define_insn "*ctz<mode>2" [(set (match_operand:SWI248 0 "register_operand" "=r") (ctz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) (clobber (reg:CC FLAGS_REG))] @@ -12286,7 +12334,44 @@ operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1); }) -(define_insn "clz<mode>2_lzcnt" +(define_expand "clz<mode>2_lzcnt" + [(parallel + [(set (match_operand:SWI248 0 "register_operand") + (clz:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_LZCNT") + +(define_insn_and_split "*clz<mode>2_lzcnt_falsedep_1" + [(set (match_operand:SWI48 0 "register_operand" "=&r") + (clz:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT + && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 0) + (clz:SWI48 (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);") + +(define_insn "*clz<mode>2_lzcnt_falsedep" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (clz:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] + "TARGET_LZCNT" + "lzcnt{<imodesuffix>}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "<MODE>")]) + +(define_insn "*clz<mode>2_lzcnt" [(set (match_operand:SWI248 0 "register_operand" "=r") (clz:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) (clobber (reg:CC FLAGS_REG))] @@ -12569,11 +12654,37 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "HI")]) -(define_insn "popcount<mode>2" - [(set (match_operand:SWI248 0 "register_operand" "=r") - (popcount:SWI248 - (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) +(define_expand "popcount<mode>2" + [(parallel + [(set (match_operand:SWI248 0 "register_operand") + (popcount:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))])] + "TARGET_POPCNT") + +(define_insn_and_split "*popcount<mode>2_falsedep_1" + [(set (match_operand:SWI48 0 "register_operand" "=&r") + (popcount:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT + && TARGET_AVOID_FALSE_DEP_FOR_BMI && optimize_function_for_speed_p (cfun)" + "#" + "&& reload_completed" + [(parallel + [(set (match_dup 0) + (popcount:SWI48 (match_dup 1))) + (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))])] + "ix86_expand_clear (operands[0]);") + +(define_insn "*popcount<mode>2_falsedep" + [(set (match_operand:SWI48 0 "register_operand" "=r") + (popcount:SWI48 + (match_operand:SWI48 1 "nonimmediate_operand" "rm"))) + (unspec [(match_operand:SWI48 2 "register_operand" "0")] + UNSPEC_INSN_FALSE_DEP) + (clobber (reg:CC FLAGS_REG))] "TARGET_POPCNT" { #if TARGET_MACHO @@ -12586,15 +12697,12 @@ (set_attr "type" "bitmanip") (set_attr "mode" "<MODE>")]) -(define_insn "*popcount<mode>2_cmp" - [(set (reg FLAGS_REG) - (compare - (popcount:SWI248 - (match_operand:SWI248 1 "nonimmediate_operand" "rm")) - (const_int 0))) - (set (match_operand:SWI248 0 "register_operand" "=r") - (popcount:SWI248 (match_dup 1)))] - "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" +(define_insn "*popcount<mode>2" + [(set (match_operand:SWI248 0 "register_operand" "=r") + (popcount:SWI248 + (match_operand:SWI248 1 "nonimmediate_operand" "rm"))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" { #if TARGET_MACHO return "popcnt\t{%1, %0|%0, %1}"; @@ -12606,25 +12714,6 @@ (set_attr "type" "bitmanip") (set_attr "mode" "<MODE>")]) -(define_insn "*popcountsi2_cmp_zext" - [(set (reg FLAGS_REG) - (compare - (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) - (const_int 0))) - (set (match_operand:DI 0 "register_operand" "=r") - (zero_extend:DI(popcount:SI (match_dup 1))))] - "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" -{ -#if TARGET_MACHO - return "popcnt\t{%1, %0|%0, %1}"; -#else - return "popcnt{l}\t{%1, %0|%0, %1}"; -#endif -} - [(set_attr "prefix_rep" "1") - (set_attr "type" "bitmanip") - (set_attr "mode" "SI")]) - (define_expand "bswapdi2" [(set (match_operand:DI 0 "register_operand") (bswap:DI (match_operand:DI 1 "nonimmediate_operand")))] Index: config/i386/x86-tune.def =================================================================== --- config/i386/x86-tune.def (revision 214059) +++ config/i386/x86-tune.def (working copy) @@ -509,6 +509,11 @@ DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", m_K8) +/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency + for bit-manipulation instructions. */ +DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", + m_SANDYBRIDGE | m_HASWELL | m_INTEL | m_GENERIC) + /*****************************************************************************/ /* This never worked well before. */ /*****************************************************************************/