Move pass_stv2 and pass_rpad after pre_reload pass_late_combine, and also
define TARGET_INSN_COST to prevent post_reload pass_late_combine from
reverting the optimization done in pass_rpad.

Adjust testcases, since pass_late_combine generates better code but
breaks the existing scan-assembler patterns.

For example:
On 32-bit targets, GCC used to generate a broadcast from the stack and
then perform the actual operation.
After flate_combine, the two are combined into a single embedded
broadcast operation.

gcc/ChangeLog:

        * config/i386/i386-features.cc (ix86_rpad_gate): New function.
        * config/i386/i386-options.cc (ix86_override_options_after_change):
        Don't disable flate_combine.
        * config/i386/i386-passes.def: Move pass_stv2 and pass_rpad
        after pre_reload pass_late_combine.
        * config/i386/i386-protos.h (ix86_rpad_gate): New declaration.
        * config/i386/i386.cc (ix86_insn_cost): New function.
        (TARGET_INSN_COST): Define.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust
        testcase.
        * gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto.
        * gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto.
        * gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto.
        * gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto.
        * gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto.
        * gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto.
        * gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto.
        * gcc.target/i386/pr91333.c: Ditto.
        * gcc.target/i386/vect-strided-4.c: Ditto.
---
 gcc/config/i386/i386-features.cc               | 16 +++++++++++-----
 gcc/config/i386/i386-options.cc                |  4 ----
 gcc/config/i386/i386-passes.def                |  4 ++--
 gcc/config/i386/i386-protos.h                  |  1 +
 gcc/config/i386/i386.cc                        | 18 ++++++++++++++++++
 .../i386/avx512f-broadcast-pr87767-1.c         |  4 ++--
 .../i386/avx512f-broadcast-pr87767-5.c         |  1 -
 .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c  |  2 +-
 .../i386/avx512vl-broadcast-pr87767-1.c        |  4 ++--
 .../i386/avx512vl-broadcast-pr87767-5.c        |  2 --
 gcc/testsuite/gcc.target/i386/pr91333.c        |  2 +-
 gcc/testsuite/gcc.target/i386/vect-strided-4.c |  2 +-
 15 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 607d1991460..fc224ed06b0 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context 
*ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+bool
+ix86_rpad_gate ()
+{
+  return (TARGET_AVX
+         && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+         && TARGET_SSE_MATH
+         && optimize
+         && optimize_function_for_speed_p (cfun));
+}
+
 /* At entry of the nearest common dominator for basic blocks with
    conversions/rcp/sqrt/rsqrt/round, generate a single
        vxorps %xmmN, %xmmN, %xmmN
@@ -3232,11 +3242,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
     {
-      return (TARGET_AVX
-             && TARGET_SSE_PARTIAL_REG_DEPENDENCY
-             && TARGET_SSE_MATH
-             && optimize
-             && optimize_function_for_speed_p (cfun));
+      return ix86_rpad_gate ();
     }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9c12d498928..1ef2c71a7a2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
        flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
     }
 
-  /* Late combine tends to undo some of the effects of STV and RPAD,
-     by combining instructions back to their original form.  */
-  if (!OPTION_SET_P (flag_late_combine_instructions))
-    flag_late_combine_instructions = 0;
 }
 
 /* Clear stack slot assignments remembered from previous functions.
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766f7b9..2d29f65da88 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -25,11 +25,11 @@ along with GCC; see the file COPYING3.  If not see
  */
 
   INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
-  INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
   /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
      CONSTM1_RTX generated by the STV pass can be CSEed.  */
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, 
pass_insert_endbr_and_patchable_area);
 
-  INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_remove_partial_avx_dependency);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 4f48dc0bf75..3dbd18dc70b 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -422,6 +422,7 @@ extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
 
 extern bool ix86_has_no_direct_extern_access;
+extern bool ix86_rpad_gate ();
 
 /* In i386-expand.cc.  */
 bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..9d2b7d1f174 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21371,6 +21371,22 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
     }
 }
 
+static int
+ix86_insn_cost (rtx_insn *insn, bool speed)
+{
+  int insn_cost = 0;
+  /* Add extra cost to keep post_reload late_combine from reverting
+     the optimization done in pass_rpad.  */
+  if (reload_completed
+      && ix86_rpad_gate ()
+      && recog_memoized (insn) >= 0
+      && get_attr_avx_partial_xmm_update (insn)
+      == AVX_PARTIAL_XMM_UPDATE_TRUE)
+    insn_cost += COSTS_N_INSNS (3);
+
+  return insn_cost + pattern_cost (PATTERN (insn), speed);
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
    scanned.  In either case, *TOTAL contains the cost result.  */
@@ -26514,6 +26530,8 @@ static const scoped_attribute_specs *const 
ix86_attribute_table[] =
 #define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
 #undef TARGET_RTX_COSTS
 #define TARGET_RTX_COSTS ix86_rtx_costs
+#undef TARGET_INSN_COST
+#define TARGET_INSN_COST ix86_insn_cost
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST ix86_address_cost
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c 
b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
index 138dbb4c973..3a50749e610 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
@@ -3,8 +3,8 @@
 /* { dg-options "-O2 -mavx512f -mavx512dq" } */
 /* { dg-additional-options "-fno-PIE" { target ia32 } } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } } */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 { target { ! 
ia32 } } } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}"  2 } }  */
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%zmm\[0-9\]+" 3 } } */
 /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%zmm\[0-9\]+" 3 { target { ! ia32 } } } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c 
b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
index d22251bc2a3..ea2f64861d0 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
@@ -3,7 +3,6 @@
 /* { dg-options "-O2 -mavx512f" } */
 /* { dg-additional-options "-fno-PIE" { target ia32 } } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to8\\\}" { target ia32 } } } 
*/
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%zmm\[0-9\]+" 4 } } */
 /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%zmm\[0-9\]+" 4 { target { ! ia32 } } } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
index 8c117207efa..bbcc5ed0bec 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fmadd-sf-zmm-7.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vfmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
 
 #define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
index cc705af8ea5..fc72dd6e557 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fmsub-sf-zmm-7.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vfmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } */
 
 #define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
index db5c34678c0..342de482da8 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vfnmadd...ps\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
 
 #define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
index 7815251b82d..f56a3f8acc4 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
-/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
+/* { dg-final { scan-assembler-times "vbroadcastss\[^\n\]*%zmm\[0-9\]+" 1 { 
target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "vfnmsub...ps\[^\n\]*%zmm\[0-9\]+" 1 } } 
*/
 
 #define type __m512
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
index e6df4d25f36..08898445be5 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
@@ -3,8 +3,8 @@
 /* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
 /* { dg-additional-options "-fno-PIE" { target ia32 } } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 } }  */
-/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 2 { target { ! 
ia32 } } } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 4 { target { ! 
ia32 } } } }  */
 /* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 2 } }  */
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 3 } } */
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c 
b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
index ebdc3619d8e..c57a2e29767 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
@@ -3,8 +3,6 @@
 /* { dg-options "-O2 -mavx512f -mavx512vl" } */
 /* { dg-additional-options "-fno-PIE" { target ia32 } } */
 /* { dg-additional-options "-mdynamic-no-pic" { target { *-*-darwin* && ia32 } 
} }
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to2\\\}" { target ia32 } } } 
*/
-/* { dg-final { scan-assembler-not "\[^\n\]*\\\{1to4\\\}" { target ia32 } } } 
*/
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%xmm\[0-9\]+" 4 } } */
 /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, 
%ymm\[0-9\]+" 4 } } */
 /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, 
%xmm\[0-9\]+" 4 { target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr91333.c 
b/gcc/testsuite/gcc.target/i386/pr91333.c
index 2bdff871024..b4940b5c9ec 100644
--- a/gcc/testsuite/gcc.target/i386/pr91333.c
+++ b/gcc/testsuite/gcc.target/i386/pr91333.c
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-O2 -mavx" } */
-/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 3 } } */
+/* { dg-final { scan-assembler-times "vmovapd|vmovsd" 2 } } */
 
 static inline double g (double x){
   asm volatile ("" : "+x" (x));
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c 
b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
index dd922926a2a..3fb9f07886e 100644
--- a/gcc/testsuite/gcc.target/i386/vect-strided-4.c
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
@@ -15,6 +15,6 @@ void foo (int * __restrict a, int * __restrict b, int *c, int 
s)
 
 /* Vectorization factor two, two two-element stores to a using movq
    and two two-element stores to b via pextrq/movhps of the high part.  */
-/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movq\[\t ]+%xmm\[0-9]" 2 } } */
 /* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
 /* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
-- 
2.31.1

Reply via email to