Various CPUs have max_cond_insns set to 5 for historical reasons. Benchmarking shows that max_cond_insns=2 is fastest on modern Cortex-A cores, so change it to 2 for all Cortex-A cores. Set max_cond_insns to 4 on Thumb-2 architectures, given that it is already limited to that value by MAX_INSN_PER_IT_BLOCK. Also use the CPU tuning setting when a CPU or tuning target is selected (-mcpu/-mtune) and -mrestrict-it is not explicitly set.
On Cortex-A57 this gives a 1.1% performance gain on SPECINT2006 as well as a 0.4% codesize reduction. Bootstrapped on armhf. OK for commit? ChangeLog: 2019-08-19 Wilco Dijkstra <wdijk...@arm.com> * gcc/config/arm/arm.c (arm_option_override_internal): Use max_cond_insns from CPU tuning unless -mrestrict-it is used. (arm_v6t2_tune): Set max_cond_insns to 4. (arm_cortex_tune): Set max_cond_insns to 2. (arm_cortex_a8_tune): Likewise. (arm_cortex_a7_tune): Likewise. (arm_cortex_a35_tune): Likewise. (arm_cortex_a53_tune): Likewise. (arm_cortex_a5_tune): Likewise. (arm_cortex_a9_tune): Likewise. (arm_v6m_tune): Set max_cond_insns to 4. --- diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 628cf02f23fb29392a63d87f561c3ee2fb73a515..38ac16ad1def91ca78ccfa98fd1679b2b5114851 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -1943,7 +1943,7 @@ const struct tune_params arm_v6t2_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 4, /* Max cond insns. */ 8, /* Memset max inline. */ 1, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1968,7 +1968,7 @@ const struct tune_params arm_cortex_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -1991,7 +1991,7 @@ const struct tune_params arm_cortex_a8_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -2014,7 +2014,7 @@ const struct tune_params arm_cortex_a7_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. 
*/ ARM_PREFETCH_NOT_BENEFICIAL, @@ -2060,7 +2060,7 @@ const struct tune_params arm_cortex_a35_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 1, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -2083,7 +2083,7 @@ const struct tune_params arm_cortex_a53_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -2167,9 +2167,6 @@ const struct tune_params arm_xgene1_tune = tune_params::SCHED_AUTOPREF_OFF }; -/* Branches can be dual-issued on Cortex-A5, so conditional execution is - less appealing. Set max_insns_skipped to a low value. */ - const struct tune_params arm_cortex_a5_tune = { &cortexa5_extra_costs, @@ -2178,7 +2175,7 @@ const struct tune_params arm_cortex_a5_tune = arm_cortex_a5_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 1, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -2201,7 +2198,7 @@ const struct tune_params arm_cortex_a9_tune = arm_default_branch_cost, &arm_default_vec_cost, 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 2, /* Max cond insns. */ 8, /* Memset max inline. */ 2, /* Issue rate. */ ARM_PREFETCH_BENEFICIAL(4,32,32), @@ -2328,7 +2325,7 @@ const struct tune_params arm_v6m_tune = arm_default_branch_cost, &arm_default_vec_cost, /* Vectorizer costs. */ 1, /* Constant limit. */ - 5, /* Max cond insns. */ + 4, /* Max cond insns. */ 8, /* Memset max inline. */ 1, /* Issue rate. */ ARM_PREFETCH_NOT_BENEFICIAL, @@ -3050,6 +3047,11 @@ arm_option_override_internal (struct gcc_options *opts, if (!TARGET_THUMB2_P (opts->x_target_flags) || !arm_arch_notm) opts->x_arm_restrict_it = 0; + /* Use the IT size from CPU specific tuning unless -mrestrict-it is used. 
*/ + if (!opts_set->x_arm_restrict_it + && (opts_set->x_arm_cpu_string || opts_set->x_arm_tune_string)) + opts->x_arm_restrict_it = 0; + /* Enable -munaligned-access by default for - all ARMv6 architecture-based processors when compiling for a 32-bit ISA i.e. Thumb2 and ARM state only.