Hi,

Here's v3 of the patch to disable the register offset addressing mode for stores of 128-bit values on Falkor, because such stores are very costly there.  Following Kyrill's suggestion, I compared the code generated for a53 and found that it was quite different.  Jim's original patch is the minimal compromise for this and is also a cleaner temporary fix before I attempt to split address costs into loads and stores for gcc9.
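To illustrate the difference (a hand-written sketch of the two sequences, not actual compiler output; register numbers are arbitrary), the patch steers a quad-word store away from the register-offset form

    str	q0, [x0, x1]		// reg+reg addressed quad-word store

and towards an add followed by a store through a single base register

    add	x2, x0, x1
    str	q0, [x2]		// single-register addressed quad-word store

which is the faster sequence on Falkor.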
So v3 is essentially a very slightly cleaned-up version of v1, this time with confirmation that there are no codegen changes in CPU2017 on non-Falkor builds; only the codegen for -mcpu=falkor is different.

----

On Falkor, because of an idiosyncrasy of how its pipelines are designed, a quad-word store using a reg+reg addressing mode is almost twice as slow as an add followed by a quad-word store with a single-register addressing mode.  So we get better performance if we disallow addressing modes using register offsets with quad-word stores.  This is the minimal change for gcc8; I volunteer to make a more lasting change for gcc9, where I will split the addressing mode costs into loads and stores wherever possible and needed.

This patch improves fpspeed by 0.17% and intspeed by 0.62% in CPU2017, with xalancbmk_s (3.84%), wrf_s (1.46%) and mcf_s (1.62%) being the biggest winners.  There were no regressions beyond 0.4%.

2018-xx-xx  Jim Wilson  <jim.wil...@linaro.org>
	    Kugan Vivekanandarajah  <kugan.vivekanandara...@linaro.org>
	    Siddhesh Poyarekar  <siddh...@sourceware.org>

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_movti_target_operand_p):
	New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf.
	* config/aarch64/aarch64-tuning-flags.def
	(SLOW_REGOFFSET_QUADWORD_STORE): New.
	* config/aarch64/aarch64.c (qdf24xx_tunings): Add
	SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
	(aarch64_movti_target_operand_p): New.
	* config/aarch64/aarch64.md (movti_aarch64): Use Utf.
	(movtf_aarch64): Likewise.
	* config/aarch64/constraints.md (Utf): New.

gcc/testsuite
	* gcc.target/aarch64/pr82533.c: New test case.
---
 gcc/config/aarch64/aarch64-protos.h         |  1 +
 gcc/config/aarch64/aarch64-simd.md          |  4 ++--
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 ++++
 gcc/config/aarch64/aarch64.c                | 14 +++++++++++++-
 gcc/config/aarch64/aarch64.md               |  8 ++++----
 gcc/config/aarch64/constraints.md           |  6 ++++++
 gcc/testsuite/gcc.target/aarch64/pr82533.c  | 11 +++++++++++
 7 files changed, 41 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr82533.c

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index cda2895d28e..5a0323deb1e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -433,6 +433,7 @@ bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
 bool aarch64_sve_struct_memory_operand_p (rtx);
+bool aarch64_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 3d1f6a01cb7..f7daac3e28d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -131,9 +131,9 @@ (define_insn "*aarch64_simd_mov<VQ:mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq, Utf,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ea9ead234cb..04baf5b6de6 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW) are not considered cheap. */ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND) +/* Don't use a register offset in a memory address for a quad-word store. */ +AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store", + SLOW_REGOFFSET_QUADWORD_STORE) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 228fd1b908d..c0a05598415 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -875,7 +875,7 @@ static const struct tune_params qdf24xx_tunings = 2, /* min_div_recip_mul_df. */ 0, /* max_case_values. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ + (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE), /* tune_flags. */ &qdf24xx_prefetch_tune }; @@ -13634,6 +13634,18 @@ aarch64_sve_struct_memory_operand_p (rtx op) && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); } +/* Return TRUE if OP uses an efficient memory address for quad-word target. */ +bool +aarch64_movti_target_operand_p (rtx op) +{ + if (! optimize_size + && (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE)) + return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS + && ! CONST_INT_P (XEXP (XEXP (op, 0), 1))); + return MEM_P (op); +} + /* Emit a register copy from operand to operand, taking care not to early-clobber source registers in the process. diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 5a2a9309a3b..1e6edcf51f2 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -1080,9 +1080,9 @@ (define_insn "*movti_aarch64" [(set (match_operand:TI 0 - "nonimmediate_operand" "= r,w, r,w,r,m,m,w,m") + "nonimmediate_operand" "= r,w, r,w,r,m,m,w,Utf") (match_operand:TI 1 - "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m,w"))] + "aarch64_movti_operand" " rUti,r, w,w,m,r,Z,m, w"))] "(register_operand (operands[0], TImode) || aarch64_reg_or_zero (operands[1], TImode))" "@ @@ -1227,9 +1227,9 @@ (define_insn "*movtf_aarch64" [(set (match_operand:TF 0 - "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m") + "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m") (match_operand:TF 1 - "general_operand" " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))] + "general_operand" " w,?r, ?r,w ,Y,Y ,m,w ,m ,?r,Y"))] "TARGET_FLOAT && (register_operand (operands[0], TFmode) || aarch64_reg_or_fp_zero (operands[1], TFmode))" "@ diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md index f052103e859..35aa62996ae 100644 --- a/gcc/config/aarch64/constraints.md +++ b/gcc/config/aarch64/constraints.md @@ -235,6 +235,12 @@ (and (match_code "mem") (match_test "aarch64_sve_ldr_operand_p (op)"))) +(define_memory_constraint "Utf" + "@internal + An efficient memory address for a quad-word target operand." 
+  (and (match_code "mem")
+       (match_test "aarch64_movti_target_operand_p (op)")))
+
 (define_memory_constraint "Utv"
  "@internal
   An address valid for loading/storing opaque structure
diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c
new file mode 100644
index 00000000000..fa28ffac03a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */
+
+void
+copy (int N, double *c, double *a)
+{
+  for (int i = 0; i < N; ++i)
+    c[i] = a[i];
+}
+
+/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */
-- 
2.14.3