> Hi, > > Core2 follows a similar pattern, although it's not seeing any > > slowdown in the "no deps, predictable, jmp" case like K8 does. > > > > Any comments? (please cc me) Should gcc be using conditional jumps > > more often, e.g. in the case of __builtin_expect()? > > The problem is that in general GCC's branch prediction algorithms are > very poor at predicting the predictability of a branch: they are pretty good > at guessing the outcome, but that is all. > > The only cases where we do so quite reliably IMO are: > 1) loop branches, which are not interesting for cmov conversion > 2) branches leading to noreturn calls, also not interesting > 3) the builtin_expect case mentioned above. > 4) when profile feedback is around to some degree (i.e. we know when the > branch is very likely or very unlikely. We don't simulate what > hardware will do on it). > > I guess we can implement the machinery for 3 and 4 (in fact I once > played with adding an EDGE_PREDICTABLE_P predicate that basically tested if > the estimated probability of the branch is <5% or >95%) but I never got really > noticeable improvements out of it and gave up.
Just for those who might be interested, I found the old patch. I will try to find time to update it to mainline, but if someone beats me to it, I definitely won't complain. Index: expr.h =================================================================== RCS file: /cvs/gcc/gcc/gcc/expr.h,v retrieving revision 1.171 diff -c -3 -p -r1.171 expr.h *** expr.h 8 Sep 2004 18:44:56 -0000 1.171 --- expr.h 25 Sep 2004 13:22:22 -0000 *************** Software Foundation, 59 Temple Place - S *** 38,43 **** --- 38,49 ---- #ifndef BRANCH_COST #define BRANCH_COST 1 #endif + #ifndef PREDICTABLE_BRANCH_COST + #define PREDICTABLE_BRANCH_COST BRANCH_COST + #endif + #ifndef COLD_BRANCH_COST + #define COLD_BRANCH_COST BRANCH_COST + #endif /* This is the 4th arg to `expand_expr'. EXPAND_STACK_PARM means we are possibly expanding a call param onto Index: ifcvt.c =================================================================== RCS file: /cvs/gcc/gcc/gcc/ifcvt.c,v retrieving revision 1.165 diff -c -3 -p -r1.165 ifcvt.c *** ifcvt.c 17 Sep 2004 05:32:36 -0000 1.165 --- ifcvt.c 25 Sep 2004 13:22:22 -0000 *************** cond_exec_process_if_block (ce_if_block_ *** 608,613 **** --- 608,614 ---- struct noce_if_info { basic_block test_bb; + int branch_cost; rtx insn_a, insn_b; rtx x, a, b; rtx jump, cond, cond_earliest; *************** noce_try_store_flag_constants (struct no *** 869,888 **** normalize = 0; else if (ifalse == 0 && exact_log2 (itrue) >= 0 && (STORE_FLAG_VALUE == 1 ! || BRANCH_COST >= 2)) normalize = 1; else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse ! && (STORE_FLAG_VALUE == 1 || BRANCH_COST >= 2)) normalize = 1, reversep = 1; else if (itrue == -1 && (STORE_FLAG_VALUE == -1 ! || BRANCH_COST >= 2)) normalize = -1; else if (ifalse == -1 && can_reverse ! && (STORE_FLAG_VALUE == -1 || BRANCH_COST >= 2)) normalize = -1, reversep = 1; ! else if ((BRANCH_COST >= 2 && STORE_FLAG_VALUE == -1) ! 
|| BRANCH_COST >= 3) normalize = -1; else return FALSE; --- 870,889 ---- normalize = 0; else if (ifalse == 0 && exact_log2 (itrue) >= 0 && (STORE_FLAG_VALUE == 1 ! || if_info->branch_cost >= 2)) normalize = 1; else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse ! && (STORE_FLAG_VALUE == 1 || if_info->branch_cost >= 2)) normalize = 1, reversep = 1; else if (itrue == -1 && (STORE_FLAG_VALUE == -1 ! || if_info->branch_cost >= 2)) normalize = -1; else if (ifalse == -1 && can_reverse ! && (STORE_FLAG_VALUE == -1 || if_info->branch_cost >= 2)) normalize = -1, reversep = 1; ! else if ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1) ! || if_info->branch_cost >= 3) normalize = -1; else return FALSE; *************** noce_try_addcc (struct noce_if_info *if_ *** 1014,1020 **** /* If that fails, construct conditional increment or decrement using setcc. */ ! if (BRANCH_COST >= 2 && (XEXP (if_info->a, 1) == const1_rtx || XEXP (if_info->a, 1) == constm1_rtx)) { --- 1015,1021 ---- /* If that fails, construct conditional increment or decrement using setcc. */ ! if (if_info->branch_cost >= 2 && (XEXP (if_info->a, 1) == const1_rtx || XEXP (if_info->a, 1) == constm1_rtx)) { *************** noce_try_store_flag_mask (struct noce_if *** 1066,1072 **** reversep = 0; if (! no_new_pseudos ! && (BRANCH_COST >= 2 || STORE_FLAG_VALUE == -1) && ((if_info->a == const0_rtx && rtx_equal_p (if_info->b, if_info->x)) --- 1067,1073 ---- reversep = 0; if (! no_new_pseudos ! && (if_info->branch_cost >= 2 || STORE_FLAG_VALUE == -1) && ((if_info->a == const0_rtx && rtx_equal_p (if_info->b, if_info->x)) *************** noce_try_cmove_arith (struct noce_if_inf *** 1223,1229 **** already checked for no side effects. */ if (! no_new_pseudos && cse_not_expected && MEM_P (a) && MEM_P (b) ! && BRANCH_COST >= 5) { a = XEXP (a, 0); b = XEXP (b, 0); --- 1224,1230 ---- already checked for no side effects. */ if (! no_new_pseudos && cse_not_expected && MEM_P (a) && MEM_P (b) ! 
&& if_info->branch_cost >= 5) { a = XEXP (a, 0); b = XEXP (b, 0); *************** noce_try_cmove_arith (struct noce_if_inf *** 1253,1259 **** if (insn_a) { insn_cost = insn_rtx_cost (PATTERN (insn_a)); ! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST)) return FALSE; } else --- 1254,1260 ---- if (insn_a) { insn_cost = insn_rtx_cost (PATTERN (insn_a)); ! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost)) return FALSE; } else *************** noce_try_cmove_arith (struct noce_if_inf *** 1263,1269 **** if (insn_b) { insn_cost += insn_rtx_cost (PATTERN (insn_b)); ! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST)) return FALSE; } --- 1264,1270 ---- if (insn_b) { insn_cost += insn_rtx_cost (PATTERN (insn_b)); ! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost)) return FALSE; } *************** noce_process_if_block (struct ce_if_bloc *** 2010,2015 **** --- 2011,2024 ---- if_info.a = a; if_info.b = b; if_info.b_unconditional = else_bb == 0; + if (!maybe_hot_bb (test_bb)) + if_info.branch_cost = COLD_BRANCH_COST + if (profile_status != PROFILE_ABSENT + && (test_bb->succ < 0.03 * REG_BR_PROB_BASE + || test_bb->succ > 0.97 * REG_BR_PROB_BASE)) + if_info.branch_cost = PREDICTABLE_BRANCH_COST; + else + if_info.branch_cost = BRANCH_COST; /* Try optimizations in some approximation of a useful order. */ /* ??? Should first look to see if X is live incoming at all. If it Index: config/i386/i386.c =================================================================== RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v retrieving revision 1.730 diff -c -3 -p -r1.730 i386.c *** config/i386/i386.c 23 Sep 2004 14:34:24 -0000 1.730 --- config/i386/i386.c 25 Sep 2004 13:22:24 -0000 *************** struct processor_costs size_cost = { /* *** 98,103 **** --- 98,104 ---- 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 1, /* Branch cost */ + 1, /* Predictable branch cost */ 2, /* cost of FADD and FSUB insns. 
*/ 2, /* cost of FMUL instruction. */ 2, /* cost of FDIV instruction. */ *************** struct processor_costs i386_cost = { /* *** 143,148 **** --- 144,150 ---- 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 1, /* Branch cost */ + 1, /* Predictable branch cost */ 23, /* cost of FADD and FSUB insns. */ 27, /* cost of FMUL instruction. */ 88, /* cost of FDIV instruction. */ *************** struct processor_costs i486_cost = { /* *** 187,192 **** --- 189,195 ---- 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 1, /* Branch cost */ + 1, /* Predictable branch cost */ 8, /* cost of FADD and FSUB insns. */ 16, /* cost of FMUL instruction. */ 73, /* cost of FDIV instruction. */ *************** struct processor_costs pentium_cost = { *** 231,236 **** --- 234,240 ---- 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ 2, /* Branch cost */ + 1, /* Predictable branch cost */ 3, /* cost of FADD and FSUB insns. */ 3, /* cost of FMUL instruction. */ 39, /* cost of FDIV instruction. */ *************** struct processor_costs pentiumpro_cost = *** 275,280 **** --- 279,285 ---- 32, /* size of prefetch block */ 6, /* number of parallel prefetches */ 2, /* Branch cost */ + 1, /* Predictable branch cost */ 3, /* cost of FADD and FSUB insns. */ 5, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. */ *************** struct processor_costs k6_cost = { *** 319,324 **** --- 324,330 ---- 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ 1, /* Branch cost */ + 1, /* Predictable branch cost */ 2, /* cost of FADD and FSUB insns. */ 2, /* cost of FMUL instruction. */ 56, /* cost of FDIV instruction. */ *************** struct processor_costs athlon_cost = { *** 362,368 **** 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 2, /* Branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. 
*/ 24, /* cost of FDIV instruction. */ --- 368,375 ---- 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 3, /* Branch cost */ ! 1, /* Predictable branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. */ 24, /* cost of FDIV instruction. */ *************** struct processor_costs k8_cost = { *** 406,412 **** 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 2, /* Branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. */ 19, /* cost of FDIV instruction. */ --- 413,420 ---- 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 3, /* Branch cost */ ! 1, /* Predictable branch cost */ 4, /* cost of FADD and FSUB insns. */ 4, /* cost of FMUL instruction. */ 19, /* cost of FDIV instruction. */ *************** struct processor_costs pentium4_cost = { *** 450,456 **** 10, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 2, /* Branch cost */ 5, /* cost of FADD and FSUB insns. */ 7, /* cost of FMUL instruction. */ 43, /* cost of FDIV instruction. */ --- 458,465 ---- 10, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ ! 3, /* Branch cost */ ! 1, /* Predictable branch cost */ 5, /* cost of FADD and FSUB insns. */ 7, /* cost of FMUL instruction. */ 43, /* cost of FDIV instruction. */ *************** struct processor_costs nocona_cost = { *** 494,500 **** 8, /* MMX or SSE register to integer */ 128, /* size of prefetch block */ 8, /* number of parallel prefetches */ ! 1, /* Branch cost */ 6, /* cost of FADD and FSUB insns. */ 8, /* cost of FMUL instruction. */ 40, /* cost of FDIV instruction. 
*/ --- 503,510 ---- 8, /* MMX or SSE register to integer */ 128, /* size of prefetch block */ 8, /* number of parallel prefetches */ ! 2, /* Branch cost */ ! 1, /* Predictable branch cost */ 6, /* cost of FADD and FSUB insns. */ 8, /* cost of FMUL instruction. */ 40, /* cost of FDIV instruction. */ Index: config/i386/i386.h =================================================================== RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.h,v retrieving revision 1.402 diff -c -3 -p -r1.402 i386.h *** config/i386/i386.h 12 Sep 2004 23:31:31 -0000 1.402 --- config/i386/i386.h 25 Sep 2004 13:22:24 -0000 *************** struct processor_costs { *** 78,83 **** --- 78,84 ---- const int simultaneous_prefetches; /* number of parallel prefetch operations. */ const int branch_cost; /* Default value for BRANCH_COST. */ + const int predictable_branch_cost; /* Default value for PREDICTABLE_BRANCH_COST. */ const int fadd; /* cost of FADD and FSUB instructions. */ const int fmul; /* cost of FMUL instruction. */ const int fdiv; /* cost of FDIV instruction. */ *************** do { \ *** 2594,2599 **** --- 2595,2602 ---- is the default; other values are interpreted relative to that. */ #define BRANCH_COST ix86_branch_cost + #define COLD_BRANCH_COST 1 + #define PREDICTABLE_BRANCH_COST ix86_predictable_branch_cost /* Define this macro as a C expression which is nonzero if accessing less than a word of memory (i.e. a `char' or a `short') is no *************** extern unsigned int ix86_preferred_stack *** 2910,2915 **** --- 2913,2919 ---- extern const char *ix86_preferred_stack_boundary_string; extern int ix86_branch_cost; + extern int ix86_predictable_branch_cost; extern const char *ix86_branch_cost_string; extern const char *ix86_debug_arg_string;