On Mon, Aug 23, 2021 at 6:17 PM Hongtao Liu <crazy...@gmail.com> wrote: > > On Tue, Aug 24, 2021 at 9:01 AM H.J. Lu via Gcc-patches > <gcc-patches@gcc.gnu.org> wrote: > > > > Broadcast from integer to a pseudo vector register instead of a hard > > vector register to allow LRA to remove redundant move instruction after > > broadcast. > > > > gcc/ > > > > PR target/102021 > > * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast > > from integer to a pseudo vector register. > > > > gcc/testsuite/ > > > > PR target/102021 > > * gcc.target/i386/pr100865-10b.c: Expect vzeroupper. > > * gcc.target/i386/pr100865-4b.c: Likewise. > > * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper. > > * gcc.target/i386/pr100865-7b.c: Likewise. > > * gcc.target/i386/pr102021.c: New test. > > --- > > gcc/config/i386/i386-expand.c | 8 +------- > > gcc/testsuite/gcc.target/i386/pr100865-10b.c | 1 - > > gcc/testsuite/gcc.target/i386/pr100865-4b.c | 3 +-- > > gcc/testsuite/gcc.target/i386/pr100865-6b.c | 6 ++---- > > gcc/testsuite/gcc.target/i386/pr100865-7b.c | 6 ++---- > > gcc/testsuite/gcc.target/i386/pr102021.c | 15 +++++++++++++++ > > 6 files changed, 21 insertions(+), 18 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 9bf13dbfa92..12db742e5f4 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -579,13 +579,7 @@ ix86_expand_vector_move (machine_mode mode, rtx > > operands[]) > > { > > /* Broadcast to XMM/YMM/ZMM register from an integer > > constant or scalar mem. */ > > - /* Hard registers are used for 2 purposes: > > - 1. Prevent stack realignment when the original code > > - doesn't use vector registers, which is the same for > > - memcpy and memset. > > - 2. Prevent combine to convert constant broadcast to > > - load from constant pool. 
*/ > > - op1 = ix86_gen_scratch_sse_rtx (mode); > > + op1 = gen_reg_rtx (mode); > > if (FLOAT_MODE_P (mode) > > || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > > { > if (FLOAT_MODE_P (mode) > || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) > - { > - first = force_const_mem (GET_MODE_INNER (mode), first); > - op1 = gen_reg_rtx (mode); > - } > + first = force_const_mem (GET_MODE_INNER (mode), first); > Could you also apply the change above? Everything else LGTM.
Like this? Thanks. > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c > > b/gcc/testsuite/gcc.target/i386/pr100865-10b.c > > index 77ace86ffe8..e5616d8d258 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c > > @@ -5,4 +5,3 @@ > > > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t > > \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ > > /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } > > } */ > > -/* { dg-final { scan-assembler-not "vzeroupper" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > index 80e9fdb12ea..6d9cb91b8e9 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c > > @@ -5,7 +5,6 @@ > > > > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t > > \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ > > /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } > > } */ > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } > > */ > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ > > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, > > %ymm\[0-9\]+" } } */ > > /* { dg-final { scan-assembler-not "vmovdqa" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c > > b/gcc/testsuite/gcc.target/i386/pr100865-6b.c > > index 35f2e961d25..9588249cb02 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c > > @@ -4,9 +4,7 @@ > > #include "pr100865-6a.c" > > > > /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t > > \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { > > target ia32 } } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { > > target { ! ia32 } } } } */ > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } > > */ > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } > > */ > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ > > /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, > > %ymm\[0-9\]+" } } */ > > /* { dg-final { scan-assembler-not "vmovdqa" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c > > b/gcc/testsuite/gcc.target/i386/pr100865-7b.c > > index ad267c43891..3b20c680521 100644 > > --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c > > +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c > > @@ -5,8 +5,6 @@ > > > > /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, > > %ymm\[0-9\]+" 1 { target { ! 
ia32 } } } } */ > > /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, > > %ymm\[0-9\]+" 1 { target ia32 } } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { > > target ia32 } } } */ > > -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 > > { target { ! ia32 } } } } */ > > -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ > > -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } > > */ > > +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } > > } */ > > +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ > > /* { dg-final { scan-assembler-not "vmovdqa" } } */ > > diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c > > b/gcc/testsuite/gcc.target/i386/pr102021.c > > new file mode 100644 > > index 00000000000..6db3f57dc76 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/i386/pr102021.c > > @@ -0,0 +1,15 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O3 -march=skylake-avx512" } */ > > + > > +#include<immintrin.h> > > + > > +__m256i > > +foo () > > +{ > > + return _mm256_set1_epi16 (12); > > +} > > + > > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, > > %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ > > +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, > > %ymm\[0-9\]+" 1 { target ia32 } } } */ > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */ > > +/* { dg-final { scan-assembler-not "vzeroupper" } } */ > > -- > > 2.31.1 > > > > > -- > BR, > Hongtao -- H.J.
From 4f71fc1de9a163191c4ab2cb3681f22e966e0ec0 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.to...@gmail.com> Date: Mon, 23 Aug 2021 14:47:03 -0700 Subject: [PATCH v2] x86: Broadcast from integer to a pseudo vector register Broadcast from integer to a pseudo vector register instead of a hard vector register to allow LRA to remove redundant move instruction after broadcast. gcc/ PR target/102021 * config/i386/i386-expand.c (ix86_expand_vector_move): Broadcast from integer to a pseudo vector register. gcc/testsuite/ PR target/102021 * gcc.target/i386/pr100865-10b.c: Expect vzeroupper. * gcc.target/i386/pr100865-4b.c: Likewise. * gcc.target/i386/pr100865-6b.c: Expect vmovdqu and vzeroupper. * gcc.target/i386/pr100865-7b.c: Likewise. * gcc.target/i386/pr102021.c: New test. --- gcc/config/i386/i386-expand.c | 13 ++----------- gcc/testsuite/gcc.target/i386/pr100865-10b.c | 1 - gcc/testsuite/gcc.target/i386/pr100865-4b.c | 3 +-- gcc/testsuite/gcc.target/i386/pr100865-6b.c | 6 ++---- gcc/testsuite/gcc.target/i386/pr100865-7b.c | 6 ++---- gcc/testsuite/gcc.target/i386/pr102021.c | 15 +++++++++++++++ 6 files changed, 22 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr102021.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9bf13dbfa92..2500dbfa7fb 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -579,19 +579,10 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) { /* Broadcast to XMM/YMM/ZMM register from an integer constant or scalar mem. */ - /* Hard registers are used for 2 purposes: - 1. Prevent stack realignment when the original code - doesn't use vector registers, which is the same for - memcpy and memset. - 2. Prevent combine to convert constant broadcast to - load from constant pool. 
*/ - op1 = ix86_gen_scratch_sse_rtx (mode); + op1 = gen_reg_rtx (mode); if (FLOAT_MODE_P (mode) || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) - { - first = force_const_mem (GET_MODE_INNER (mode), first); - op1 = gen_reg_rtx (mode); - } + first = force_const_mem (GET_MODE_INNER (mode), first); bool ok = ix86_expand_vector_init_duplicate (false, mode, op1, first); gcc_assert (ok); diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c b/gcc/testsuite/gcc.target/i386/pr100865-10b.c index 77ace86ffe8..e5616d8d258 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c @@ -5,4 +5,3 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } */ -/* { dg-final { scan-assembler-not "vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index 80e9fdb12ea..6d9cb91b8e9 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -5,7 +5,6 @@ /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 2 } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-6b.c b/gcc/testsuite/gcc.target/i386/pr100865-6b.c index 35f2e961d25..9588249cb02 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-6b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-6b.c @@ -4,9 +4,7 @@ #include "pr100865-6a.c" /* { dg-final { scan-assembler-times "vpbroadcastd\[\\t \]+%(?:r|e)\[^\n\]*, %ymm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu32\[\\t \]%ymm\[0-9\]+, " 8 { target { ! ia32 } } } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 8 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastd\[\\t \]+%xmm\[0-9\]+, %ymm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-7b.c b/gcc/testsuite/gcc.target/i386/pr100865-7b.c index ad267c43891..3b20c680521 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-7b.c @@ -5,8 +5,6 @@ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ /* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 { target ia32 } } } */ -/* { dg-final { scan-assembler-times "vmovdqu64\[\\t \]%ymm\[0-9\]+, " 16 { target { ! 
ia32 } } } } */ -/* { dg-final { scan-assembler-times "vzeroupper" 1 { target ia32 } } } */ -/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%ymm\[0-9\]+, " 16 } } */ +/* { dg-final { scan-assembler-times "vzeroupper" 1 } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr102021.c b/gcc/testsuite/gcc.target/i386/pr102021.c new file mode 100644 index 00000000000..6db3f57dc76 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102021.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=skylake-avx512" } */ + +#include<immintrin.h> + +__m256i +foo () +{ + return _mm256_set1_epi16 (12); +} + +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+%r\[^\n\]*, %ymm\[0-9\]+" 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpbroadcastq\[\\t \]+\[^\n\]*, %ymm\[0-9\]+" 1 { target ia32 } } } */ +/* { dg-final { scan-assembler-not "vmovdqa" } } */ +/* { dg-final { scan-assembler-not "vzeroupper" } } */ -- 2.31.1