> Hi,
> > Core2 follows a similar pattern, although it's not seeing any
> > slowdown in the "no deps, predictable, jmp" case like K8 does.
> >
> > Any comments? (please cc me) Should gcc be using conditional jumps
> > more often eg. in the case of __builtin_expect())?
>
> The problem is that in general GCC's branch prediction algorithms are
> very poor at predicting the predictability of a branch: they are pretty
> good at guessing the outcome, but that is all.
>
> The only cases where we do so quite reliably IMO are:
> 1) loop branches that are not interesting for cmov conversion
> 2) branches leading to noreturn calls, also not interesting
> 3) builtin_expect mentioned.
> 4) when profile feedback is around to some degree (ie we know when the
> branch is very likely or very unlikely. We don't simulate what
> hardware will do on it).
>
> I guess we can implement the machinery for 3 and 4 (in fact I once
> played with adding an EDGE_PREDICTABLE_P predicate that basically tested
> whether the estimated probability of the branch is <5% or >95%), but I
> never got really noticeable improvements out of it and gave up.
Just for those who might be interested, I found the old patch.
I will try to find time to update it to mainline, but if someone beats
me to it, I definitely won't complain.
Index: expr.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/expr.h,v
retrieving revision 1.171
diff -c -3 -p -r1.171 expr.h
*** expr.h 8 Sep 2004 18:44:56 -0000 1.171
--- expr.h 25 Sep 2004 13:22:22 -0000
*************** Software Foundation, 59 Temple Place - S
*** 38,43 ****
--- 38,49 ----
#ifndef BRANCH_COST
#define BRANCH_COST 1
#endif
+ #ifndef PREDICTABLE_BRANCH_COST
+ #define PREDICTABLE_BRANCH_COST BRANCH_COST
+ #endif
+ #ifndef COLD_BRANCH_COST
+ #define COLD_BRANCH_COST BRANCH_COST
+ #endif
/* This is the 4th arg to `expand_expr'.
EXPAND_STACK_PARM means we are possibly expanding a call param onto
Index: ifcvt.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/ifcvt.c,v
retrieving revision 1.165
diff -c -3 -p -r1.165 ifcvt.c
*** ifcvt.c 17 Sep 2004 05:32:36 -0000 1.165
--- ifcvt.c 25 Sep 2004 13:22:22 -0000
*************** cond_exec_process_if_block (ce_if_block_
*** 608,613 ****
--- 608,614 ----
struct noce_if_info
{
basic_block test_bb;
+ int branch_cost;
rtx insn_a, insn_b;
rtx x, a, b;
rtx jump, cond, cond_earliest;
*************** noce_try_store_flag_constants (struct no
*** 869,888 ****
normalize = 0;
else if (ifalse == 0 && exact_log2 (itrue) >= 0
&& (STORE_FLAG_VALUE == 1
! || BRANCH_COST >= 2))
normalize = 1;
else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse
! && (STORE_FLAG_VALUE == 1 || BRANCH_COST >= 2))
normalize = 1, reversep = 1;
else if (itrue == -1
&& (STORE_FLAG_VALUE == -1
! || BRANCH_COST >= 2))
normalize = -1;
else if (ifalse == -1 && can_reverse
! && (STORE_FLAG_VALUE == -1 || BRANCH_COST >= 2))
normalize = -1, reversep = 1;
! else if ((BRANCH_COST >= 2 && STORE_FLAG_VALUE == -1)
! || BRANCH_COST >= 3)
normalize = -1;
else
return FALSE;
--- 870,889 ----
normalize = 0;
else if (ifalse == 0 && exact_log2 (itrue) >= 0
&& (STORE_FLAG_VALUE == 1
! || if_info->branch_cost >= 2))
normalize = 1;
else if (itrue == 0 && exact_log2 (ifalse) >= 0 && can_reverse
! && (STORE_FLAG_VALUE == 1 || if_info->branch_cost >= 2))
normalize = 1, reversep = 1;
else if (itrue == -1
&& (STORE_FLAG_VALUE == -1
! || if_info->branch_cost >= 2))
normalize = -1;
else if (ifalse == -1 && can_reverse
! && (STORE_FLAG_VALUE == -1 || if_info->branch_cost >= 2))
normalize = -1, reversep = 1;
! else if ((if_info->branch_cost >= 2 && STORE_FLAG_VALUE == -1)
! || if_info->branch_cost >= 3)
normalize = -1;
else
return FALSE;
*************** noce_try_addcc (struct noce_if_info *if_
*** 1014,1020 ****
/* If that fails, construct conditional increment or decrement using
setcc. */
! if (BRANCH_COST >= 2
&& (XEXP (if_info->a, 1) == const1_rtx
|| XEXP (if_info->a, 1) == constm1_rtx))
{
--- 1015,1021 ----
/* If that fails, construct conditional increment or decrement using
setcc. */
! if (if_info->branch_cost >= 2
&& (XEXP (if_info->a, 1) == const1_rtx
|| XEXP (if_info->a, 1) == constm1_rtx))
{
*************** noce_try_store_flag_mask (struct noce_if
*** 1066,1072 ****
reversep = 0;
if (! no_new_pseudos
! && (BRANCH_COST >= 2
|| STORE_FLAG_VALUE == -1)
&& ((if_info->a == const0_rtx
&& rtx_equal_p (if_info->b, if_info->x))
--- 1067,1073 ----
reversep = 0;
if (! no_new_pseudos
! && (if_info->branch_cost >= 2
|| STORE_FLAG_VALUE == -1)
&& ((if_info->a == const0_rtx
&& rtx_equal_p (if_info->b, if_info->x))
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1223,1229 ****
already checked for no side effects. */
if (! no_new_pseudos && cse_not_expected
&& MEM_P (a) && MEM_P (b)
! && BRANCH_COST >= 5)
{
a = XEXP (a, 0);
b = XEXP (b, 0);
--- 1224,1230 ----
already checked for no side effects. */
if (! no_new_pseudos && cse_not_expected
&& MEM_P (a) && MEM_P (b)
! && if_info->branch_cost >= 5)
{
a = XEXP (a, 0);
b = XEXP (b, 0);
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1253,1259 ****
if (insn_a)
{
insn_cost = insn_rtx_cost (PATTERN (insn_a));
! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST))
return FALSE;
}
else
--- 1254,1260 ----
if (insn_a)
{
insn_cost = insn_rtx_cost (PATTERN (insn_a));
! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost))
return FALSE;
}
else
*************** noce_try_cmove_arith (struct noce_if_inf
*** 1263,1269 ****
if (insn_b) {
insn_cost += insn_rtx_cost (PATTERN (insn_b));
! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (BRANCH_COST))
return FALSE;
}
--- 1264,1270 ----
if (insn_b) {
insn_cost += insn_rtx_cost (PATTERN (insn_b));
! if (insn_cost == 0 || insn_cost > COSTS_N_INSNS (if_info->branch_cost))
return FALSE;
}
*************** noce_process_if_block (struct ce_if_bloc
*** 2010,2015 ****
--- 2011,2024 ----
if_info.a = a;
if_info.b = b;
if_info.b_unconditional = else_bb == 0;
+ if (!maybe_hot_bb (test_bb)) /* Pick the branch cost used by the noce_try_* routines. */
+ if_info.branch_cost = COLD_BRANCH_COST; /* test_bb is not considered hot. */
+ else if (profile_status != PROFILE_ABSENT
+ && (test_bb->succ->probability < 0.03 * REG_BR_PROB_BASE
+ || test_bb->succ->probability > 0.97 * REG_BR_PROB_BASE))
+ if_info.branch_cost = PREDICTABLE_BRANCH_COST; /* First successor edge is <3% or >97% likely. */
+ else
+ if_info.branch_cost = BRANCH_COST; /* No usable prediction: default cost. */
/* Try optimizations in some approximation of a useful order. */
/* ??? Should first look to see if X is live incoming at all. If it
Index: config/i386/i386.c
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.c,v
retrieving revision 1.730
diff -c -3 -p -r1.730 i386.c
*** config/i386/i386.c 23 Sep 2004 14:34:24 -0000 1.730
--- config/i386/i386.c 25 Sep 2004 13:22:24 -0000
*************** struct processor_costs size_cost = { /*
*** 98,103 ****
--- 98,104 ----
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
+ 1, /* Predictable branch cost */
2, /* cost of FADD and FSUB insns. */
2, /* cost of FMUL instruction. */
2, /* cost of FDIV instruction. */
*************** struct processor_costs i386_cost = { /*
*** 143,148 ****
--- 144,150 ----
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
+ 1, /* Predictable branch cost */
23, /* cost of FADD and FSUB insns. */
27, /* cost of FMUL instruction. */
88, /* cost of FDIV instruction. */
*************** struct processor_costs i486_cost = { /*
*** 187,192 ****
--- 189,195 ----
0, /* size of prefetch block */
0, /* number of parallel prefetches */
1, /* Branch cost */
+ 1, /* Predictable branch cost */
8, /* cost of FADD and FSUB insns. */
16, /* cost of FMUL instruction. */
73, /* cost of FDIV instruction. */
*************** struct processor_costs pentium_cost = {
*** 231,236 ****
--- 234,240 ----
0, /* size of prefetch block */
0, /* number of parallel prefetches */
2, /* Branch cost */
+ 1, /* Predictable branch cost */
3, /* cost of FADD and FSUB insns. */
3, /* cost of FMUL instruction. */
39, /* cost of FDIV instruction. */
*************** struct processor_costs pentiumpro_cost =
*** 275,280 ****
--- 279,285 ----
32, /* size of prefetch block */
6, /* number of parallel prefetches */
2, /* Branch cost */
+ 1, /* Predictable branch cost */
3, /* cost of FADD and FSUB insns. */
5, /* cost of FMUL instruction. */
56, /* cost of FDIV instruction. */
*************** struct processor_costs k6_cost = {
*** 319,324 ****
--- 324,330 ----
32, /* size of prefetch block */
1, /* number of parallel prefetches */
1, /* Branch cost */
+ 1, /* Predictable branch cost */
2, /* cost of FADD and FSUB insns. */
2, /* cost of FMUL instruction. */
56, /* cost of FDIV instruction. */
*************** struct processor_costs athlon_cost = {
*** 362,368 ****
5, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 2, /* Branch cost */
4, /* cost of FADD and FSUB insns. */
4, /* cost of FMUL instruction. */
24, /* cost of FDIV instruction. */
--- 368,375 ----
5, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 3, /* Branch cost */
! 1, /* Predictable branch cost */
4, /* cost of FADD and FSUB insns. */
4, /* cost of FMUL instruction. */
24, /* cost of FDIV instruction. */
*************** struct processor_costs k8_cost = {
*** 406,412 ****
5, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 2, /* Branch cost */
4, /* cost of FADD and FSUB insns. */
4, /* cost of FMUL instruction. */
19, /* cost of FDIV instruction. */
--- 413,420 ----
5, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 3, /* Branch cost */
! 1, /* Predictable branch cost */
4, /* cost of FADD and FSUB insns. */
4, /* cost of FMUL instruction. */
19, /* cost of FDIV instruction. */
*************** struct processor_costs pentium4_cost = {
*** 450,456 ****
10, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 2, /* Branch cost */
5, /* cost of FADD and FSUB insns. */
7, /* cost of FMUL instruction. */
43, /* cost of FDIV instruction. */
--- 458,465 ----
10, /* MMX or SSE register to integer */
64, /* size of prefetch block */
6, /* number of parallel prefetches */
! 3, /* Branch cost */
! 1, /* Predictable branch cost */
5, /* cost of FADD and FSUB insns. */
7, /* cost of FMUL instruction. */
43, /* cost of FDIV instruction. */
*************** struct processor_costs nocona_cost = {
*** 494,500 ****
8, /* MMX or SSE register to integer */
128, /* size of prefetch block */
8, /* number of parallel prefetches */
! 1, /* Branch cost */
6, /* cost of FADD and FSUB insns. */
8, /* cost of FMUL instruction. */
40, /* cost of FDIV instruction. */
--- 503,510 ----
8, /* MMX or SSE register to integer */
128, /* size of prefetch block */
8, /* number of parallel prefetches */
! 2, /* Branch cost */
! 1, /* Predictable branch cost */
6, /* cost of FADD and FSUB insns. */
8, /* cost of FMUL instruction. */
40, /* cost of FDIV instruction. */
Index: config/i386/i386.h
===================================================================
RCS file: /cvs/gcc/gcc/gcc/config/i386/i386.h,v
retrieving revision 1.402
diff -c -3 -p -r1.402 i386.h
*** config/i386/i386.h 12 Sep 2004 23:31:31 -0000 1.402
--- config/i386/i386.h 25 Sep 2004 13:22:24 -0000
*************** struct processor_costs {
*** 78,83 ****
--- 78,84 ----
const int simultaneous_prefetches; /* number of parallel prefetch
operations. */
const int branch_cost; /* Default value for BRANCH_COST. */
+ const int predictable_branch_cost; /* Default value for PREDICTABLE_BRANCH_COST. */
const int fadd; /* cost of FADD and FSUB instructions. */
const int fmul; /* cost of FMUL instruction. */
const int fdiv; /* cost of FDIV instruction. */
*************** do { \
*** 2594,2599 ****
--- 2595,2602 ----
is the default; other values are interpreted relative to that. */
#define BRANCH_COST ix86_branch_cost
+ #define COLD_BRANCH_COST 1
+ #define PREDICTABLE_BRANCH_COST ix86_predictable_branch_cost
/* Define this macro as a C expression which is nonzero if accessing
less than a word of memory (i.e. a `char' or a `short') is no
*************** extern unsigned int ix86_preferred_stack
*** 2910,2915 ****
--- 2913,2919 ----
extern const char *ix86_preferred_stack_boundary_string;
extern int ix86_branch_cost;
+ extern int ix86_predictable_branch_cost;
extern const char *ix86_branch_cost_string;
extern const char *ix86_debug_arg_string;