This patch modifies tunings for ampere1/ampere1a/ampere1b, to: 1. Allow reassociation on FP additions. 2. Avoid generating loop-dependant FMA chains. Added a tuning option for this.
Bootstrapped and tested. Is this ok for trunk? Thanks, Di Zhao gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AARCH64_EXTRA_TUNING_OPTION): New tuing option to avoid cross-loop FMA. * config/aarch64/aarch64.cc (aarch64_override_options_internal): Set param_avoid_fma_max_bits according to tuning option. * config/aarch64/tuning_models/ampere1.h: Modify tunings related with FMA. * config/aarch64/tuning_models/ampere1a.h: Modify tunings related with FMA. * config/aarch64/tuning_models/ampere1b.h: Modify tunings related with FMA. --- gcc/config/aarch64/aarch64-tuning-flags.def | 2 ++ gcc/config/aarch64/aarch64.cc | 6 ++++++ gcc/config/aarch64/tuning_models/ampere1.h | 2 +- gcc/config/aarch64/tuning_models/ampere1a.h | 4 ++-- gcc/config/aarch64/tuning_models/ampere1b.h | 5 +++-- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 774568e9106..f28a73839a6 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -47,4 +47,6 @@ AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS) AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT) +AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 64684258b7b..28bc70a787f 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -16083,6 +16083,12 @@ aarch64_override_options_internal (struct gcc_options *opts) && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) opts->x_flag_prefetch_loop_arrays = 1; + /* Avoid loop-dependant FMA chains. */ + if (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA) + SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits, + 512); + aarch64_override_options_after_change_1 (opts); } diff --git a/gcc/config/aarch64/tuning_models/ampere1.h b/gcc/config/aarch64/tuning_models/ampere1.h index 8d2a1c69610..a144e8f94b3 100644 --- a/gcc/config/aarch64/tuning_models/ampere1.h +++ b/gcc/config/aarch64/tuning_models/ampere1.h @@ -104,7 +104,7 @@ static const struct tune_params ampere1_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */ &ere1_prefetch_tune, AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/ampere1a.h b/gcc/config/aarch64/tuning_models/ampere1a.h index c419ffb3c1a..f688ed08a79 100644 --- a/gcc/config/aarch64/tuning_models/ampere1a.h +++ b/gcc/config/aarch64/tuning_models/ampere1a.h @@ -50,13 +50,13 @@ static const struct tune_params ampere1a_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ - 1, /* fma_reassoc_width. */ + 4, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */ &ere1_prefetch_tune, AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/ampere1b.h b/gcc/config/aarch64/tuning_models/ampere1b.h index c4928f50d29..a98b6a980f7 100644 --- a/gcc/config/aarch64/tuning_models/ampere1b.h +++ b/gcc/config/aarch64/tuning_models/ampere1b.h @@ -99,13 +99,14 @@ static const struct tune_params ampere1b_tunings = "32:16", /* loop_align. */ 2, /* int_reassoc_width. */ 4, /* fp_reassoc_width. */ - 1, /* fma_reassoc_width. */ + 4, /* fma_reassoc_width. */ 2, /* vec_reassoc_width. */ 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | + AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA), /* tune_flags. */ &ere1b_prefetch_tune, AARCH64_LDP_STP_POLICY_ALIGNED, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALIGNED /* stp_policy_model. */ -- 2.25.1