On Fri, Jun 28, 2024 at 7:29 AM liuhongt <hongtao....@intel.com> wrote: > > Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, also > define target_insn_cost to prevent post_reload pass_late_combine to > revert the optimziation did in pass_rpad. > > Adjust testcases since pass_late_combine generates better code but > break scan assembly. > > .i.e > Under 32-bit target, gcc used to generate broadcast from stack and > then do the real operation. > After flate_combine, they're combined into embeded broadcast > operations. > > gcc/ChangeLog: > > * config/i386/i386-features.cc (ix86_rpad_gate): New function. > * config/i386/i386-options.cc (ix86_override_options_after_change): > Don't disable flate_combine. > * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad > after pre_reload pas_late_combine. > * config/i386/i386-protos.h (ix86_rpad_gate): New declare. > * config/i386/i386.cc (ix86_insn_cost): New function. > (TARGET_INSN_COST): Define. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjus > testcase. > * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto. > * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto. > * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto. > * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto. > * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto. > * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto. > * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto. > * gcc.target/i386/pr91333.c: Ditto. > * gcc.target/i386/vect-strided-4.c: Ditto.
LGTM. Thanks, Uros. > --- > gcc/config/i386/i386-features.cc | 16 +++++++++++----- > gcc/config/i386/i386-options.cc | 4 ---- > gcc/config/i386/i386-passes.def | 4 ++-- > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.cc | 18 ++++++++++++++++++ > .../i386/avx512f-broadcast-pr87767-1.c | 4 ++-- > .../i386/avx512f-broadcast-pr87767-5.c | 1 - > .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c | 2 +- > .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c | 2 +- > .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c | 2 +- > .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c | 2 +- > .../i386/avx512vl-broadcast-pr87767-1.c | 4 ++-- > .../i386/avx512vl-broadcast-pr87767-5.c | 2 -- > gcc/testsuite/gcc.target/i386/pr91333.c | 2 +- > gcc/testsuite/gcc.target/i386/vect-strided-4.c | 2 +- > 15 files changed, 42 insertions(+), 24 deletions(-) > > diff --git a/gcc/config/i386/i386-features.cc > b/gcc/config/i386/i386-features.cc > index 607d1991460..fc224ed06b0 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area > (gcc::context *ctxt) > return new pass_insert_endbr_and_patchable_area (ctxt); > } > > +bool > +ix86_rpad_gate () > +{ > + return (TARGET_AVX > + && TARGET_SSE_PARTIAL_REG_DEPENDENCY > + && TARGET_SSE_MATH > + && optimize > + && optimize_function_for_speed_p (cfun)); > +} > + > /* At entry of the nearest common dominator for basic blocks with > conversions/rcp/sqrt/rsqrt/round, generate a single > vxorps %xmmN, %xmmN, %xmmN > @@ -3232,11 +3242,7 @@ public: > /* opt_pass methods: */ > bool gate (function *) final override > { > - return (TARGET_AVX > - && TARGET_SSE_PARTIAL_REG_DEPENDENCY > - && TARGET_SSE_MATH > - && optimize > - && optimize_function_for_speed_p (cfun)); > + return ix86_rpad_gate (); > } > > unsigned int execute (function *) final override > diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc > index 9c12d498928..1ef2c71a7a2 100644 > --- a/gcc/config/i386/i386-options.cc > +++ b/gcc/config/i386/i386-options.cc > @@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void) > flag_cunroll_grow_size = flag_peel_loops || optimize >= 3; > } > > - /* Late combine tends to undo some of the effects of STV and RPAD, > - by combining instructions back to their original form. */ > - if (!OPTION_SET_P (flag_late_combine_instructions)) > - flag_late_combine_instructions = 0; > } > > /* Clear stack slot assignments remembered from previous functions. > diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > index 7d96766f7b9..2d29f65da88 100644 > --- a/gcc/config/i386/i386-passes.def > +++ b/gcc/config/i386/i386-passes.def > @@ -25,11 +25,11 @@ along with GCC; see the file COPYING3. If not see > */ > > INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper); > - INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */); > + INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */); > /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and > CONSTM1_RTX generated by the STV pass can be CSEed. */ > INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); > > INSERT_PASS_BEFORE (pass_shorten_branches, 1, > pass_insert_endbr_and_patchable_area); > > - INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > + INSERT_PASS_AFTER (pass_late_combine, 1, > pass_remove_partial_avx_dependency); > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 4f48dc0bf75..3dbd18dc70b 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -422,6 +422,7 @@ extern rtl_opt_pass > *make_pass_remove_partial_avx_dependency > (gcc::context *); > > extern bool ix86_has_no_direct_extern_access; > +extern bool ix86_rpad_gate (); > > /* In i386-expand.cc. */ > bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*, > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 1f71ed04be6..9d2b7d1f174 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -21371,6 +21371,22 @@ ix86_shift_rotate_cost (const struct processor_costs > *cost, > } > } > > +static int > +ix86_insn_cost (rtx_insn *insn, bool speed) > +{ > + int insn_cost = 0; > + /* Add extra cost to avoid post_reload late_combine revert > + the optimization did in pass_rpad. */ > + if (reload_completed > + && ix86_rpad_gate () > + && recog_memoized (insn) >= 0 > + && get_attr_avx_partial_xmm_update (insn) > + == AVX_PARTIAL_XMM_UPDATE_TRUE) > + insn_cost += COSTS_N_INSNS (3); > + > + return insn_cost + pattern_cost (PATTERN (insn), speed); > +} > + > /* Compute a (partial) cost for rtx X. Return true if the complete > cost has been computed, and false if subexpressions should be > scanned. In either case, *TOTAL contains the cost result. */ > @@ -26514,6 +26530,8 @@ static const scoped_attribute_specs *const > ix86_attribute_table[] = > #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost > #undef TARGET_RTX_COSTS > #define TARGET_RTX_COSTS ix86_rtx_costs > +#undef TARGET_INSN_COST > +#define TARGET_INSN_COST ix86_insn_cost > #undef TARGET_ADDRESS_COST > #define TARGET_ADDRESS_COST ix86_address_cost > > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c > b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c > index 138dbb4c973..3a50749e610 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c > @@ -3,8 +3,8 @@ > /* { dg-options "-O2 -mavx512f -mavx512dq" } */ > /* { dg-additional-options "-fno-PIE" { target ia32 } } */ > /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 > } } } > -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */ > -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */ > +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! > ia32 } } } } */ > +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %zmm\[0-9\]+" 3 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, > %zmm\[0-9\]+" 3 { target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c > b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c > index d22251bc2a3..ea2f64861d0 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c > @@ -3,7 +3,6 @@ > /* { dg-options "-O2 -mavx512f" } */ > /* { dg-additional-options "-fno-PIE" { target ia32 } } */ > /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 > } } } > -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } > } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %zmm\[0-9\]+" 4 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, > %zmm\[0-9\]+" 4 { target { ! ia32 } } } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c > b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c > index 8c117207efa..bbcc5ed0bec 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-mavx512f -O2" } */ > -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { > target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } > */ > > #define type __m512 > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c > b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c > index cc705af8ea5..fc72dd6e557 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-mavx512f -O2" } */ > -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { > target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } > */ > > #define type __m512 > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c > b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c > index db5c34678c0..342de482da8 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-mavx512f -O2" } */ > -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { > target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > > #define type __m512 > diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c > b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c > index 7815251b82d..f56a3f8acc4 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c > +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-mavx512f -O2" } */ > -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { > target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } > } */ > > #define type __m512 > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c > b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c > index e6df4d25f36..08898445be5 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c > @@ -3,8 +3,8 @@ > /* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */ > /* { dg-additional-options "-fno-PIE" { target ia32 } } */ > /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 > } } } > -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } } */ > -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } } */ > +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! > ia32 } } } } */ > +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! > ia32 } } } } */ > /* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %xmm\[0-9\]+" 3 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %ymm\[0-9\]+" 3 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c > b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c > index ebdc3619d8e..c57a2e29767 100644 > --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c > @@ -3,8 +3,6 @@ > /* { dg-options "-O2 -mavx512f -mavx512vl" } */ > /* { dg-additional-options "-fno-PIE" { target ia32 } } */ > /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 > } } } > -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } > } */ > -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } > } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %xmm\[0-9\]+" 4 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, > %ymm\[0-9\]+" 4 } } */ > /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, > %xmm\[0-9\]+" 4 { target { ! ia32 } } } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr91333.c > b/gcc/testsuite/gcc.target/i386/pr91333.c > index 2bdff871024..b4940b5c9ec 100644 > --- a/gcc/testsuite/gcc.target/i386/pr91333.c > +++ b/gcc/testsuite/gcc.target/i386/pr91333.c > @@ -1,6 +1,6 @@ > /* { dg-do compile { target { ! ia32 } } } */ > /* { dg-options "-O2 -mavx" } */ > -/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */ > +/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */ > > static inline double g (double x){ > asm volatile ("" : "+x" (x)); > diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c > b/gcc/testsuite/gcc.target/i386/vect-strided-4.c > index dd922926a2a..3fb9f07886e 100644 > --- a/gcc/testsuite/gcc.target/i386/vect-strided-4.c > +++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c > @@ -15,6 +15,6 @@ void foo (int * __restrict a, int * __restrict b, int *c, > int s) > > /* Vectorization factor two, two two-element stores to a using movq > and two two-element stores to b via pextrq/movhps of the high part. */ > -/* { dg-final { scan-assembler-times "movq" 2 } } */ > +/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */ > /* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */ > /* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */ > -- > 2.31.1 >