>What happens if you set preferred_for_speed to false for alternative 1?
It works, and I've removed the newly added splitter in this patch.
Also, I tried to do something similar for *vec_dup<mode> with the mode iterator
AVX2_VEC_DUP_MODE, but it hit an ICE during reload since x86 doesn't have a
direct move for QImode from a GPR to an SSE register. So in this patch I only
handle *vec_dupv4si.
>> +(define_split
>> + [(set (match_operand:V4SI 0 "sse_reg_operand")
>> + (vec_duplicate:V4SI
>> + (match_operand:SI 1 "general_reg_operand")))]
>> + "TARGET_SSE && reload_completed
>> + /* Disable this splitter if avx512vl_vec_dup_gprv4si insn is
>> + available, because then we can broadcast from GPRs directly. */
>I think avx512vl_vec_dup_gprv4si should be merged with the above
>pattern instead.
Remove this splitter.
This patch enables the code generation change below:
- vbroadcastss .LC1(%rip), %xmm0
+ movl $-45, %edx
+ vmovd %edx, %xmm0
+ vpshufd $0, %xmm0, %xmm0
According to a microbenchmark, this is faster than broadcasting from memory
on targets with TARGET_INTER_UNIT_MOVES_TO_VEC.
gcc/ChangeLog:
* config/i386/sse.md (*vec_dupv4si): Add preferred_for_speed attribute
to disable the memory operand alternative for
TARGET_INTER_UNIT_MOVES_TO_VEC when optimizing for speed.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr100865-8a.c: Adjust testcase.
* gcc.target/i386/pr100865-8c.c: Ditto.
* gcc.target/i386/pr100865-9c.c: Ditto.
---
gcc/config/i386/sse.md | 7 ++++++-
gcc/testsuite/gcc.target/i386/pr100865-8a.c | 2 +-
gcc/testsuite/gcc.target/i386/pr100865-8c.c | 2 +-
gcc/testsuite/gcc.target/i386/pr100865-9c.c | 2 +-
4 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3066ea3734a..a091853065e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -25134,7 +25134,12 @@ (define_insn "*vec_dupv4si"
(set_attr "length_immediate" "1,0,1")
(set_attr "prefix_extra" "0,1,*")
(set_attr "prefix" "maybe_vex,maybe_evex,orig")
- (set_attr "mode" "TI,V4SF,V4SF")])
+ (set_attr "mode" "TI,V4SF,V4SF")
+ (set (attr "preferred_for_speed")
+ (cond [(eq_attr "alternative" "1")
+ (symbol_ref "!TARGET_INTER_UNIT_MOVES_TO_VEC")
+ ]
+ (symbol_ref "true")))])
(define_insn "*vec_dupv2di"
[(set (match_operand:V2DI 0 "register_operand" "=x,v,v,x")
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8a.c b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
index 911b14d4a25..544a14db6f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8a.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8a.c
@@ -20,5 +20,5 @@ foo (void)
array[i] = MK_CONST128_BROADCAST_SIGNED (-45);
}
-/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "(?:vpbroadcastd|vpshufd)\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-8c.c b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
index 00682edb8c9..efee0488614 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-8c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-8c.c
@@ -3,5 +3,5 @@
#include "pr100865-8a.c"
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-9c.c b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
index 8ffcdc1629d..e6f25902c1d 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-9c.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-9c.c
@@ -3,5 +3,5 @@
#include "pr100865-9a.c"
-/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times "vpshufd\[\\t \]+\[^\n\]*, %xmm\[0-9\]+" 1 } } */
/* { dg-final { scan-assembler-times "vmovdqa\[\\t \]%xmm\[0-9\]+, " 16 } } */
--
2.18.1