For mode2 bigger than 16 bytes, when it can be allocated to FIRST_SSE_REG,
then it can only be allocated to ALL_SSE_REGS, and it is tieable with
any mode1 of the same or smaller size that is also valid in FIRST_SSE_REG.
When mode2 is exactly 16 bytes, the relaxation applies only to vector
modes; for non-vector modes (TImode/TFmode) mode1 must still be exactly
16 bytes.
This is needed for CSE of all-ones/all-zeros: CSE checks costs with
ix86_modes_tieable_p for modes of different sizes.
Also update ix86_rtx_costs to prevent CONST0_RTX from being propagated,
since that propagation would defeat CSE of CONST0_RTX.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/92080
* config/i386/i386.cc (ix86_modes_tieable_p): Relax
GET_MODE_SIZE (mode1) to <= 64/32/16 bytes when it can be
allocated to FIRST_SSE_REG.  The size of mode1 no longer needs
to be exactly the same as mode2 when mode2 is >= 16 bytes
(for 16 bytes, vector modes only).
(ix86_rtx_costs): Increase cost of const_double/const_vector
0/-1 a little to prevent propagation and enable more CSE.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr92080_vec_dup.c: New test.
* gcc.target/i386/pr92080_zero.c: New test.
---
gcc/config/i386/i386.cc | 14 +++--
.../gcc.target/i386/pr92080_vec_dup.c | 48 +++++++++++++++++
gcc/testsuite/gcc.target/i386/pr92080_zero.c | 51 +++++++++++++++++++
3 files changed, 108 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_zero.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..72b9859e376 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20933,15 +20933,17 @@ ix86_modes_tieable_p (machine_mode mode1,
machine_mode mode2)
any other mode acceptable to SSE registers. */
if (GET_MODE_SIZE (mode2) == 64
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 64
+ return (GET_MODE_SIZE (mode1) <= 64
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
if (GET_MODE_SIZE (mode2) == 32
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 32
+ return (GET_MODE_SIZE (mode1) <= 32
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
if (GET_MODE_SIZE (mode2) == 16
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
- return (GET_MODE_SIZE (mode1) == 16
+ return ((VECTOR_MODE_P (mode2)
+ ? GET_MODE_SIZE (mode1) <= 16
+ : GET_MODE_SIZE (mode1) == 16)
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
/* If MODE2 is appropriate for an MMX register, then tie
@@ -21507,10 +21509,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int
outer_code_i, int opno,
case 0:
break;
case 1: /* 0: xor eliminates false dependency */
- *total = 0;
+ /* Add extra cost 1 to prevent propagation of CONST_VECTOR
+ for SET, which will enable more CSE optimization. */
+ *total = 0 + (outer_code == SET);
return true;
default: /* -1: cmp contains false dependency */
- *total = 1;
+ *total = 1 + (outer_code == SET);
return true;
}
/* FALLTHRU */
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
new file mode 100644
index 00000000000..67fdd15d69c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcast\[bwd\]" 3 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16si sinksz;
+v8si sinksy;
+v4si sinksx;
+v32hi sinkhz;
+v16hi sinkhy;
+v8hi sinkhx;
+v64qi sinkbz;
+v32qi sinkby;
+v16qi sinkbx;
+
+void foo(char c) {
+ sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+ sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+ sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+ sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+ c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+ sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_zero.c
b/gcc/testsuite/gcc.target/i386/pr92080_zero.c
new file mode 100644
index 00000000000..c6a8a98e955
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_zero.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v3" } */
+/* { dg-final { scan-assembler-times "vpxor" 2 } } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+
+v16qi b1;
+v8hi h1;
+v4si s1;
+v32qi b2;
+v16hi h2;
+v8si s2;
+
+void
+foo ()
+{
+ s1 = __extension__(v4si){0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo1 ()
+{
+ s1 = __extension__(v4si){-1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1};
+}
+
+
+void
+foo2 ()
+{
+ s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0};
+ h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0};
+ b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+}
+
+void
+foo3 ()
+{
+ s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1};
+ h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1};
+ b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1};
+}
--
2.31.1