For mode2 bigger than 16 bytes, when it can be allocated to FIRST_SSE_REG,
it can only be allocated to ALL_SSE_REGS, and it is tieable with any
mode1 of the same or smaller size that can also be allocated to
FIRST_SSE_REG.  When mode2 is exactly 16 bytes, the relaxation applies
only to vector modes; non-vector modes (TImode/TFmode) are excluded.
This is needed for CSE of all-ones/all-zeros, because CSE checks costs
with ix86_modes_tieable_p across modes of different sizes.

Also update ix86_rtx_costs to prevent CONST0_RTX from being propagated,
since propagating it would defeat CSE of CONST0_RTX.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

        PR target/92080
        * config/i386/i386.cc (ix86_modes_tieable_p): Relax
        GET_MODE_SIZE (mode1) to <= 64/32/16 bytes when it can be
        allocated to FIRST_SSE_REG; the mode sizes don't need to be
        exactly the same when >= 16 bytes.
        (ix86_rtx_costs): Increase cost of const_double/const_vector
        0/-1 a little to prevent propagation and enable more CSE.

gcc/testsuite/ChangeLog:

        * gcc.target/i386/pr92080_vec_dup.c: New test.
        * gcc.target/i386/pr92080_zero.c: New test.
---
 gcc/config/i386/i386.cc                       | 14 +++--
 .../gcc.target/i386/pr92080_vec_dup.c         | 48 +++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr92080_zero.c  | 51 +++++++++++++++++++
 3 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_zero.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..72b9859e376 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20933,15 +20933,17 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2)
      any other mode acceptable to SSE registers.  */
   if (GET_MODE_SIZE (mode2) == 64
       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 64
+    return (GET_MODE_SIZE (mode1) <= 64
            && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 32
       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 32
+    return (GET_MODE_SIZE (mode1) <= 32
            && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 16
       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-    return (GET_MODE_SIZE (mode1) == 16
+    return ((VECTOR_MODE_P (mode2)
+            ? GET_MODE_SIZE (mode1) <= 16
+            : GET_MODE_SIZE (mode1) == 16)
            && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
 
   /* If MODE2 is appropriate for an MMX register, then tie
@@ -21507,10 +21509,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
        case 0:
          break;
        case 1:  /* 0: xor eliminates false dependency */
-         *total = 0;
+         /* Add extra cost 1 to prevent propagation of CONST_VECTOR
+            for SET, which will enable more CSE optimization.  */
+         *total = 0 + (outer_code == SET);
          return true;
        default: /* -1: cmp contains false dependency */
-         *total = 1;
+         *total = 1 + (outer_code == SET);
          return true;
        }
       /* FALLTHRU */
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
new file mode 100644
index 00000000000..67fdd15d69c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcast\[bwd\]" 3 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16si sinksz;
+v8si sinksy;
+v4si sinksx;
+v32hi sinkhz;
+v16hi sinkhy;
+v8hi sinkhx;
+v64qi sinkbz;
+v32qi sinkby;
+v16qi sinkbx;
+
+void foo(char c) {
+  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+  sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+  sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+  sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+    c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_zero.c b/gcc/testsuite/gcc.target/i386/pr92080_zero.c
new file mode 100644
index 00000000000..c6a8a98e955
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_zero.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo ()
+{
+  s1 = __extension__(v4si){0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+  s1 = __extension__(v4si){-1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+  s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+  h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+  b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+  s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+  h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+  b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+}
-- 
2.31.1

Reply via email to