The attached patch considerably improves zero-extending SImode -> DImode
moves between SSE registers for SSE4.1 targets. The patch teaches the
compiler to generate:
vmovdqa m(%rip), %ymm1
vpmovzxdq %xmm1, %xmm1
vpsrlw %xmm1, %xmm0, %xmm0
to zero-extend the value within the SSE register, instead of
round-tripping the value through a GPR:
vmovdqa m(%rip), %ymm1
vmovd %xmm1, %eax
vmovq %rax, %xmm1
vpsrlw %xmm1, %xmm0, %xmm0
... or horrible code on targets that do not prefer inter-unit moves.
As mentioned by Jakub, there are other optimization opportunities in
the handling of the count argument.
2017-04-06 Uros Bizjak <[email protected]>
PR target/80286
* config/i386/sse.md (*vec_extractv4si_0_zext_sse4): New pattern.
* config/i386/i386.md (*zero_extendsidi2):
Add (?*x,*x) and (?*v,*v) alternatives.
The patch was bootstrapped and regression-tested on x86_64-linux-gnu {,-m32}.
Committed to mainline SVN.
Uros.
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 246738)
+++ config/i386/i386.md (working copy)
@@ -3767,10 +3767,10 @@
(define_insn "*zero_extendsidi2"
[(set (match_operand:DI 0 "nonimmediate_operand"
- "=r,?r,?o,r ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
+ "=r,?r,?o,r ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,?*x,?*v,*r")
(zero_extend:DI
(match_operand:SI 1 "x86_64_zext_operand"
- "0 ,rm,r ,rmWz,0,r ,m ,*Yj,*x,r ,m ,*k")))]
+ "0 ,rm,r ,rmWz,0,r ,m ,*Yj,*x,r ,m , *x, *v,*k")))]
""
{
switch (get_attr_type (insn))
@@ -3791,6 +3791,15 @@
return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
case TYPE_SSEMOV:
+ if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
+ {
+ if (EXT_REX_SSE_REG_P (operands[0])
+ || EXT_REX_SSE_REG_P (operands[1]))
+ return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
+ else
+ return "%vpmovzxdq\t{%1, %0|%0, %1}";
+ }
+
if (GENERAL_REG_P (operands[0]))
return "%vmovd\t{%1, %k0|%k0, %1}";
@@ -3813,6 +3822,10 @@
(eq_attr "alternative" "10")
(const_string "sse2")
(eq_attr "alternative" "11")
+ (const_string "sse4")
+ (eq_attr "alternative" "12")
+ (const_string "avx512f")
+ (eq_attr "alternative" "13")
(const_string "x64_avx512bw")
]
(const_string "*")))
@@ -3821,16 +3834,16 @@
(const_string "multi")
(eq_attr "alternative" "5,6")
(const_string "mmxmov")
- (eq_attr "alternative" "7,9,10")
+ (eq_attr "alternative" "7,9,10,11,12")
(const_string "ssemov")
(eq_attr "alternative" "8")
(const_string "sselog1")
- (eq_attr "alternative" "11")
+ (eq_attr "alternative" "13")
(const_string "mskmov")
]
(const_string "imovx")))
(set (attr "prefix_extra")
- (if_then_else (eq_attr "alternative" "8")
+ (if_then_else (eq_attr "alternative" "8,11,12")
(const_string "1")
(const_string "*")))
(set (attr "length_immediate")
@@ -3848,7 +3861,7 @@
(set (attr "mode")
(cond [(eq_attr "alternative" "5,6")
(const_string "DI")
- (eq_attr "alternative" "7,8,9")
+ (eq_attr "alternative" "7,8,9,11,12")
(const_string "TI")
]
(const_string "SI")))])
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 246738)
+++ config/i386/sse.md (working copy)
@@ -13516,18 +13516,6 @@
"#"
[(set_attr "isa" "*,sse4,*,*")])
-(define_insn_and_split "*vec_extractv4si_0_zext"
- [(set (match_operand:DI 0 "register_operand" "=r")
- (zero_extend:DI
- (vec_select:SI
- (match_operand:V4SI 1 "register_operand" "v")
- (parallel [(const_int 0)]))))]
- "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
- "#"
- "&& reload_completed"
- [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
- "operands[1] = gen_lowpart (SImode, operands[1]);")
-
(define_insn "*vec_extractv2di_0_sse"
[(set (match_operand:DI 0 "nonimmediate_operand" "=v,m")
(vec_select:DI
@@ -13546,6 +13534,35 @@
[(set (match_dup 0) (match_dup 1))]
"operands[1] = gen_lowpart (<MODE>mode, operands[1]);")
+(define_insn "*vec_extractv4si_0_zext_sse4"
+ [(set (match_operand:DI 0 "register_operand" "=r,x,v")
+ (zero_extend:DI
+ (vec_select:SI
+ (match_operand:V4SI 1 "register_operand" "Yj,x,v")
+ (parallel [(const_int 0)]))))]
+ "TARGET_SSE4_1"
+ "#"
+ [(set_attr "isa" "x64,*,avx512f")])
+
+(define_insn "*vec_extractv4si_0_zext"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (zero_extend:DI
+ (vec_select:SI
+ (match_operand:V4SI 1 "register_operand" "x")
+ (parallel [(const_int 0)]))))]
+ "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
+ "#")
+
+(define_split
+ [(set (match_operand:DI 0 "register_operand")
+ (zero_extend:DI
+ (vec_select:SI
+ (match_operand:V4SI 1 "register_operand")
+ (parallel [(const_int 0)]))))]
+ "TARGET_SSE2 && reload_completed"
+ [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+ "operands[1] = gen_lowpart (SImode, operands[1]);")
+
(define_insn "*vec_extractv4si"
[(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm,Yr,*x,x,Yv")
(vec_select:SI