For mode2 bigger than 16 bytes, when it can be allocated to FIRST_SSE_REG, it can only be allocated to ALL_SSE_REGS, so it can be tieable with every mode1 of smaller or equal size that is also available to FIRST_SSE_REG. When mode2 is exactly 16 bytes, exclude non-vector modes (TI/TFmode) from this relaxation. This is needed for CSE of all-ones/all-zeros constants, because CSE checks costs with ix86_modes_tieable_p across modes of different sizes.
Also update ix86_rtx_costs to prevent CONST0_RTX from being propagated, since such propagation would defeat CSE of CONST0_RTX. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR target/92080 * config/i386/i386.cc (ix86_modes_tieable_p): Relax MODE_SIZE (mode1) to <= 64/32/16 bytes when it can be allocated to FIRST_SSE_REG; it doesn't need to be exactly the same when >= 16. (ix86_rtx_costs): Increase cost of const_double/const_vector 0/-1 a little to prevent propagation and enable more CSE. gcc/testsuite/ChangeLog: * gcc.target/i386/pr92080_vec_dup.c: New test. * gcc.target/i386/pr92080_zero.c: New test. --- gcc/config/i386/i386.cc | 14 +++-- .../gcc.target/i386/pr92080_vec_dup.c | 48 +++++++++++++++++ gcc/testsuite/gcc.target/i386/pr92080_zero.c | 51 +++++++++++++++++++ 3 files changed, 108 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_zero.c diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 224a78cc832..72b9859e376 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -20933,15 +20933,17 @@ ix86_modes_tieable_p (machine_mode mode1, machine_mode mode2) any other mode acceptable to SSE registers. */ if (GET_MODE_SIZE (mode2) == 64 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 64 + return (GET_MODE_SIZE (mode1) <= 64 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); if (GET_MODE_SIZE (mode2) == 32 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 32 + return (GET_MODE_SIZE (mode1) <= 32 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); if (GET_MODE_SIZE (mode2) == 16 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return (GET_MODE_SIZE (mode1) == 16 + return ((VECTOR_MODE_P (mode2) + ? 
GET_MODE_SIZE (mode1) <= 16 + : GET_MODE_SIZE (mode1) == 16) && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); /* If MODE2 is appropriate for an MMX register, then tie @@ -21507,10 +21509,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, case 0: break; case 1: /* 0: xor eliminates false dependency */ - *total = 0; + /* Add extra cost 1 to prevent propagation of CONST_VECTOR + for SET, which will enable more CSE optimization. */ + *total = 0 + (outer_code == SET); return true; default: /* -1: cmp contains false dependency */ - *total = 1; + *total = 1 + (outer_code == SET); return true; } /* FALLTHRU */ diff --git a/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c new file mode 100644 index 00000000000..67fdd15d69c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v4 -O2" } */ +/* { dg-final { scan-assembler-times "vpbroadcast\[bwd\]" 3 } } */ + +typedef int v16si __attribute__((vector_size(64))); +typedef int v8si __attribute__((vector_size(32))); +typedef int v4si __attribute__((vector_size(16))); + +typedef short v32hi __attribute__((vector_size(64))); +typedef short v16hi __attribute__((vector_size(32))); +typedef short v8hi __attribute__((vector_size(16))); + +typedef char v64qi __attribute__((vector_size(64))); +typedef char v32qi __attribute__((vector_size(32))); +typedef char v16qi __attribute__((vector_size(16))); + +v16si sinksz; +v8si sinksy; +v4si sinksx; +v32hi sinkhz; +v16hi sinkhy; +v8hi sinkhx; +v64qi sinkbz; +v32qi sinkby; +v16qi sinkbx; + +void foo(char c) { + sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; + sinksy = __extension__(v8si){c,c,c,c,c,c,c,c}; + sinksx = __extension__(v4si){c,c,c,c}; +} + +void foo1(char c) { + sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; + sinkhy = 
__extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; + sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c}; +} + +void foo2(char c) { + sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; + sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c, + c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; + sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c}; +} diff --git a/gcc/testsuite/gcc.target/i386/pr92080_zero.c b/gcc/testsuite/gcc.target/i386/pr92080_zero.c new file mode 100644 index 00000000000..c6a8a98e955 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92080_zero.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v3" } */ +/* { dg-final { scan-assembler-times "vpxor" 2 } } */ +/* { dg-final { scan-assembler-times "vpcmpeq" 2 } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v8si __attribute__((vector_size(32))); +typedef short v8hi __attribute__((vector_size(16))); +typedef short v16hi __attribute__((vector_size(32))); +typedef char v16qi __attribute__((vector_size(16))); +typedef char v32qi __attribute__((vector_size(32))); + +v16qi b1; +v8hi h1; +v4si s1; +v32qi b2; +v16hi h2; +v8si s2; + +void +foo () +{ + s1 = __extension__(v4si){0, 0, 0, 0}; + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +} + +void +foo1 () +{ + s1 = __extension__(v4si){-1, -1, -1, -1}; + h1 = __extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; +} + + +void +foo2 () +{ + s2 = __extension__(v8si){0, 0, 0, 0, 0, 0, 0, 0}; + h1 = __extension__(v8hi){0, 0, 0, 0, 0, 0, 0, 0}; + b1 = __extension__(v16qi){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +} + +void +foo3 () +{ + s2 = __extension__(v8si){-1, -1, -1, -1, -1, -1, -1, -1}; + h1 = 
__extension__(v8hi){-1, -1, -1, -1, -1, -1, -1, -1}; + b1 = __extension__(v16qi){-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; +} -- 2.31.1