On Fri, Jun 28, 2024 at 7:29 AM liuhongt <hongtao....@intel.com> wrote:
>
> Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, also
> define target_insn_cost to prevent post_reload pass_late_combine to
> revert the optimziation did in pass_rpad.
>
> Adjust testcases since pass_late_combine generates better code but
> break scan assembly.
>
> .i.e
> Under 32-bit target, gcc used to generate broadcast from stack and
> then do the real operation.
> After flate_combine, they're combined into embeded broadcast
> operations.
>
> gcc/ChangeLog:
>
>         * config/i386/i386-features.cc (ix86_rpad_gate): New function.
>         * config/i386/i386-options.cc (ix86_override_options_after_change):
>         Don't disable flate_combine.
>         * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad
>         after pre_reload pas_late_combine.
>         * config/i386/i386-protos.h (ix86_rpad_gate): New declare.
>         * config/i386/i386.cc (ix86_insn_cost): New function.
>         (TARGET_INSN_COST): Define.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjus
>         testcase.
>         * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto.
>         * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto.
>         * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto.
>         * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto.
>         * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto.
>         * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto.
>         * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto.
>         * gcc.target/i386/pr91333.c: Ditto.
>         * gcc.target/i386/vect-strided-4.c: Ditto.

LGTM.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-features.cc               | 16 +++++++++++-----
>  gcc/config/i386/i386-options.cc                |  4 ----
>  gcc/config/i386/i386-passes.def                |  4 ++--
>  gcc/config/i386/i386-protos.h                  |  1 +
>  gcc/config/i386/i386.cc                        | 18 ++++++++++++++++++
>  .../i386/avx512f-broadcast-pr87767-1.c         |  4 ++--
>  .../i386/avx512f-broadcast-pr87767-5.c         |  1 -
>  .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c   |  2 +-
>  .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c   |  2 +-
>  .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c  |  2 +-
>  .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c  |  2 +-
>  .../i386/avx512vl-broadcast-pr87767-1.c        |  4 ++--
>  .../i386/avx512vl-broadcast-pr87767-5.c        |  2 --
>  gcc/testsuite/gcc.target/i386/pr91333.c        |  2 +-
>  gcc/testsuite/gcc.target/i386/vect-strided-4.c |  2 +-
>  15 files changed, 42 insertions(+), 24 deletions(-)
>
> diff --git a/gcc/config/i386/i386-features.cc 
> b/gcc/config/i386/i386-features.cc
> index 607d1991460..fc224ed06b0 100644
> --- a/gcc/config/i386/i386-features.cc
> +++ b/gcc/config/i386/i386-features.cc
> @@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area 
> (gcc::context *ctxt)
>    return new pass_insert_endbr_and_patchable_area (ctxt);
>  }
>
> +bool
> +ix86_rpad_gate ()
> +{
> +  return (TARGET_AVX
> +         && TARGET_SSE_PARTIAL_REG_DEPENDENCY
> +         && TARGET_SSE_MATH
> +         && optimize
> +         && optimize_function_for_speed_p (cfun));
> +}
> +
>  /* At entry of the nearest common dominator for basic blocks with
>     conversions/rcp/sqrt/rsqrt/round, generate a single
>         vxorps %xmmN, %xmmN, %xmmN
> @@ -3232,11 +3242,7 @@ public:
>    /* opt_pass methods: */
>    bool gate (function *) final override
>      {
> -      return (TARGET_AVX
> -             && TARGET_SSE_PARTIAL_REG_DEPENDENCY
> -             && TARGET_SSE_MATH
> -             && optimize
> -             && optimize_function_for_speed_p (cfun));
> +      return ix86_rpad_gate ();
>      }
>
>    unsigned int execute (function *) final override
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 9c12d498928..1ef2c71a7a2 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
>         flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
>      }
>
> -  /* Late combine tends to undo some of the effects of STV and RPAD,
> -     by combining instructions back to their original form.  */
> -  if (!OPTION_SET_P (flag_late_combine_instructions))
> -    flag_late_combine_instructions = 0;
>  }
>
>  /* Clear stack slot assignments remembered from previous functions.
> diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
> index 7d96766f7b9..2d29f65da88 100644
> --- a/gcc/config/i386/i386-passes.def
> +++ b/gcc/config/i386/i386-passes.def
> @@ -25,11 +25,11 @@ along with GCC; see the file COPYING3.  If not see
>   */
>
>    INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
> -  INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
> +  INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
>    /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
>       CONSTM1_RTX generated by the STV pass can be CSEed.  */
>    INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
>
>    INSERT_PASS_BEFORE (pass_shorten_branches, 1, 
> pass_insert_endbr_and_patchable_area);
>
> -  INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
> +  INSERT_PASS_AFTER (pass_late_combine, 1, 
> pass_remove_partial_avx_dependency);
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 4f48dc0bf75..3dbd18dc70b 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -422,6 +422,7 @@ extern rtl_opt_pass 
> *make_pass_remove_partial_avx_dependency
>    (gcc::context *);
>
>  extern bool ix86_has_no_direct_extern_access;
> +extern bool ix86_rpad_gate ();
>
>  /* In i386-expand.cc.  */
>  bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 1f71ed04be6..9d2b7d1f174 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -21371,6 +21371,22 @@ ix86_shift_rotate_cost (const struct processor_costs 
> *cost,
>      }
>  }
>
> +static int
> +ix86_insn_cost (rtx_insn *insn, bool speed)
> +{
> +  int insn_cost = 0;
> +  /* Add extra cost to avoid post_reload late_combine revert
> +     the optimization did in pass_rpad.  */
> +  if (reload_completed
> +      && ix86_rpad_gate ()
> +      && recog_memoized (insn) >= 0
> +      && get_attr_avx_partial_xmm_update (insn)
> +      == AVX_PARTIAL_XMM_UPDATE_TRUE)
> +    insn_cost += COSTS_N_INSNS (3);
> +
> +  return insn_cost + pattern_cost (PATTERN (insn), speed);
> +}
> +
>  /* Compute a (partial) cost for rtx X.  Return true if the complete
>     cost has been computed, and false if subexpressions should be
>     scanned.  In either case, *TOTAL contains the cost result.  */
> @@ -26514,6 +26530,8 @@ static const scoped_attribute_specs *const 
> ix86_attribute_table[] =
>  #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
>  #undef TARGET_RTX_COSTS
>  #define TARGET_RTX_COSTS ix86_rtx_costs
> +#undef TARGET_INSN_COST
> +#define TARGET_INSN_COST ix86_insn_cost
>  #undef TARGET_ADDRESS_COST
>  #define TARGET_ADDRESS_COST ix86_address_cost
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
> index 138dbb4c973..3a50749e610 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
> @@ -3,8 +3,8 @@
>  /* { dg-options "-O2 -mavx512f -mavx512dq" } */
>  /* { dg-additional-options "-fno-PIE" { target ia32 } } */
>  /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 
> } } }
> -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
> -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! 
> ia32 } } } } */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}"  2 } }  */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %zmm\[0-9\]+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
> %zmm\[0-9\]+" 3 { target { ! ia32 } } } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
> index d22251bc2a3..ea2f64861d0 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
> @@ -3,7 +3,6 @@
>  /* { dg-options "-O2 -mavx512f" } */
>  /* { dg-additional-options "-fno-PIE" { target ia32 } } */
>  /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 
> } } }
> -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } 
> } */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %zmm\[0-9\]+" 4 } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
> %zmm\[0-9\]+" 4 { target { ! ia32 } } } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
> index 8c117207efa..bbcc5ed0bec 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512f -O2" } */
> -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
> target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } 
> */
>
>  #define type __m512
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
> index cc705af8ea5..fc72dd6e557 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512f -O2" } */
> -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
> target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } 
> */
>
>  #define type __m512
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
> index db5c34678c0..342de482da8 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512f -O2" } */
> -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
> target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
>
>  #define type __m512
> diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c 
> b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
> index 7815251b82d..f56a3f8acc4 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512f -O2" } */
> -/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
> +/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
> target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } 
> } */
>
>  #define type __m512
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
> index e6df4d25f36..08898445be5 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
> @@ -3,8 +3,8 @@
>  /* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
>  /* { dg-additional-options "-fno-PIE" { target ia32 } } */
>  /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 
> } } }
> -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } }  */
> -/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } }  */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! 
> ia32 } } } }  */
> +/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! 
> ia32 } } } }  */
>  /* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %xmm\[0-9\]+" 3 } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %ymm\[0-9\]+" 3 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c 
> b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
> index ebdc3619d8e..c57a2e29767 100644
> --- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
> @@ -3,8 +3,6 @@
>  /* { dg-options "-O2 -mavx512f -mavx512vl" } */
>  /* { dg-additional-options "-fno-PIE" { target ia32 } } */
>  /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 
> } } }
> -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } 
> } */
> -/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } 
> } */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %xmm\[0-9\]+" 4 } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
> %ymm\[0-9\]+" 4 } } */
>  /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
> %xmm\[0-9\]+" 4 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr91333.c 
> b/gcc/testsuite/gcc.target/i386/pr91333.c
> index 2bdff871024..b4940b5c9ec 100644
> --- a/gcc/testsuite/gcc.target/i386/pr91333.c
> +++ b/gcc/testsuite/gcc.target/i386/pr91333.c
> @@ -1,6 +1,6 @@
>  /* { dg-do compile { target { ! ia32 } } } */
>  /* { dg-options "-O2 -mavx" } */
> -/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */
> +/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */
>
>  static inline double g (double x){
>    asm volatile ("" : "+x" (x));
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c 
> b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> index dd922926a2a..3fb9f07886e 100644
> --- a/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> @@ -15,6 +15,6 @@ void foo (int * __restrict a, int * __restrict b, int *c, 
> int s)
>
>  /* Vectorization factor two, two two-element stores to a using movq
>     and two two-element stores to b via pextrq/movhps of the high part.  */
> -/* { dg-final { scan-assembler-times "movq" 2 } } */
> +/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */
>  /* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
>  /* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
> --
> 2.31.1
>

Reply via email to