Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, and also define target_insn_cost to prevent post_reload pass_late_combine from reverting the optimization done in pass_rpad.
Adjust testcases since pass_late_combine generates better code but breaks scan assembly. i.e., under 32-bit target, gcc used to generate broadcast from stack and then do the real operation. After flate_combine, they're combined into embedded broadcast operations. gcc/ChangeLog: * config/i386/i386-features.cc (ix86_rpad_gate): New function. * config/i386/i386-options.cc (ix86_override_options_after_change): Don't disable flate_combine. * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad after pre_reload pass_late_combine. * config/i386/i386-protos.h (ix86_rpad_gate): New declaration. * config/i386/i386.cc (ix86_insn_cost): New function. (TARGET_INSN_COST): Define. gcc/testsuite/ChangeLog: * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust testcase. * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto. * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto. * gcc.target/i386/pr91333.c: Ditto. * gcc.target/i386/vect-strided-4.c: Ditto. 
--- gcc/config/i386/i386-features.cc | 16 +++++++++++----- gcc/config/i386/i386-options.cc | 4 ---- gcc/config/i386/i386-passes.def | 4 ++-- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.cc | 18 ++++++++++++++++++ .../i386/avx512f-broadcast-pr87767-1.c | 4 ++-- .../i386/avx512f-broadcast-pr87767-5.c | 1 - .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c | 2 +- .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c | 2 +- .../i386/avx512vl-broadcast-pr87767-1.c | 4 ++-- .../i386/avx512vl-broadcast-pr87767-5.c | 2 -- gcc/testsuite/gcc.target/i386/pr91333.c | 2 +- gcc/testsuite/gcc.target/i386/vect-strided-4.c | 2 +- 15 files changed, 42 insertions(+), 24 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 607d1991460..fc224ed06b0 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt) return new pass_insert_endbr_and_patchable_area (ctxt); } +bool +ix86_rpad_gate () +{ + return (TARGET_AVX + && TARGET_SSE_PARTIAL_REG_DEPENDENCY + && TARGET_SSE_MATH + && optimize + && optimize_function_for_speed_p (cfun)); +} + /* At entry of the nearest common dominator for basic blocks with conversions/rcp/sqrt/rsqrt/round, generate a single vxorps %xmmN, %xmmN, %xmmN @@ -3232,11 +3242,7 @@ public: /* opt_pass methods: */ bool gate (function *) final override { - return (TARGET_AVX - && TARGET_SSE_PARTIAL_REG_DEPENDENCY - && TARGET_SSE_MATH - && optimize - && optimize_function_for_speed_p (cfun)); + return ix86_rpad_gate (); } unsigned int execute (function *) final override diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc index 9c12d498928..1ef2c71a7a2 100644 --- a/gcc/config/i386/i386-options.cc +++ b/gcc/config/i386/i386-options.cc @@ -1944,10 +1944,6 @@ ix86_override_options_after_change 
(void) flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; } - /* Late combine tends to undo some of the effects of STV and RPAD, - by combining instructions back to their original form. */ - if (!OPTION_SET_P (flag_late_combine_instructions)) - flag_late_combine_instructions = 0; } /* Clear stack slot assignments remembered from previous functions. diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def index 7d96766f7b9..2d29f65da88 100644 --- a/gcc/config/i386/i386-passes.def +++ b/gcc/config/i386/i386-passes.def @@ -25,11 +25,11 @@ along with GCC; see the file COPYING3. If not see */ INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper); - INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */); /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and CONSTM1_RTX generated by the STV pass can be CSEed. */ INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); - INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); + INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 4f48dc0bf75..3dbd18dc70b 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -422,6 +422,7 @@ extern rtl_opt_pass *make_pass_remove_partial_avx_dependency (gcc::context *); extern bool ix86_has_no_direct_extern_access; +extern bool ix86_rpad_gate (); /* In i386-expand.cc. 
*/ bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 1f71ed04be6..9d2b7d1f174 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21371,6 +21371,22 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, } } +static int +ix86_insn_cost (rtx_insn *insn, bool speed) +{ + int insn_cost = 0; + /* Add extra cost to avoid post_reload late_combine revert + the optimization did in pass_rpad. */ + if (reload_completed + && ix86_rpad_gate () + && recog_memoized (insn) >= 0 + && get_attr_avx_partial_xmm_update (insn) + == AVX_PARTIAL_XMM_UPDATE_TRUE) + insn_cost += COSTS_N_INSNS (3); + + return insn_cost + pattern_cost (PATTERN (insn), speed); +} + /* Compute a (partial) cost for rtx X. Return true if the complete cost has been computed, and false if subexpressions should be scanned. In either case, *TOTAL contains the cost result. */ @@ -26514,6 +26530,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] = #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost #undef TARGET_RTX_COSTS #define TARGET_RTX_COSTS ix86_rtx_costs +#undef TARGET_INSN_COST +#define TARGET_INSN_COST ix86_insn_cost #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST ix86_address_cost diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c index 138dbb4c973..3a50749e610 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c @@ -3,8 +3,8 @@ /* { dg-options "-O2 -mavx512f -mavx512dq" } */ /* { dg-additional-options "-fno-PIE" { target ia32 } } */ /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } } -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */ -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */ +/* { dg-final { scan-assembler-times 
"\[^\n\]*\\\{1to8\\\}" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 3 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 3 { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c index d22251bc2a3..ea2f64861d0 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c @@ -3,7 +3,6 @@ /* { dg-options "-O2 -mavx512f" } */ /* { dg-additional-options "-fno-PIE" { target ia32 } } */ /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } } -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %zmm\[0-9\]+" 4 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %zmm\[0-9\]+" 4 { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c index 8c117207efa..bbcc5ed0bec 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */ #define type __m512 diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c index cc705af8ea5..fc72dd6e557 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */ #define type __m512 diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c index db5c34678c0..342de482da8 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */ #define type __m512 diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c index 7815251b82d..f56a3f8acc4 100644 --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-mavx512f -O2" } */ -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { target { ! 
ia32 } } } } */ /* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */ #define type __m512 diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c index e6df4d25f36..08898445be5 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c @@ -3,8 +3,8 @@ /* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */ /* { dg-additional-options "-fno-PIE" { target ia32 } } */ /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } } -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } } */ -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } } */ +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 3 } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c index ebdc3619d8e..c57a2e29767 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c @@ -3,8 +3,6 @@ /* { dg-options "-O2 -mavx512f -mavx512vl" } */ /* { dg-additional-options "-fno-PIE" { target ia32 } } */ /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } } } -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } } */ -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 4 } } */ /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 4 } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %xmm\[0-9\]+" 4 { target { ! ia32 } } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr91333.c b/gcc/testsuite/gcc.target/i386/pr91333.c index 2bdff871024..b4940b5c9ec 100644 --- a/gcc/testsuite/gcc.target/i386/pr91333.c +++ b/gcc/testsuite/gcc.target/i386/pr91333.c @@ -1,6 +1,6 @@ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-O2 -mavx" } */ -/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */ +/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */ static inline double g (double x){ asm volatile ("" : "+x" (x)); diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c index dd922926a2a..3fb9f07886e 100644 --- a/gcc/testsuite/gcc.target/i386/vect-strided-4.c +++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c @@ -15,6 +15,6 @@ void foo (int * __restrict a, int * __restrict b, int *c, int s) /* Vectorization factor two, two two-element stores to a using movq and two two-element stores to b via pextrq/movhps of the high part. */ -/* { dg-final { scan-assembler-times "movq" 2 } } */ +/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */ /* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */ -- 2.31.1