Hi,
while working on Firefox performance
http://hubicka.blogspot.com/2018/12/even-more-fun-with-building-and.html
I noticed that we still disable inter-unit moves to vector registers because
of Bulldozer CPUs. Last year I left that in the generic tuning because Zen
was new and it did not seem to hurt other CPUs much in tests. I think it is
time to tune for more recent CPUs here, because it also turns out that
disabling inter-unit moves penalizes hand-written vectorized code quite
noticeably. In particular
#include <emmintrin.h>
__m128i test (short a,short b,short c,short d,short e,short f,short g)
{
return _mm_set_epi16(a,b,c,d,e,f,g,255);
}
translates to
movzwl %r8w, %r8d
movzwl %di, %edi
movzwl %r9w, %r9d
movzwl %si, %esi
salq $16, %r8
salq $16, %rdi
movzwl 8(%rsp), %eax
movzwl %dx, %edx
orq %r9, %r8
orq %rsi, %rdi
movzwl %cx, %ecx
salq $16, %r8
salq $16, %rdi
orq %rax, %r8
orq %rdx, %rdi
salq $16, %r8
salq $16, %rdi
orb $-1, %r8b
orq %rcx, %rdi
movq %r8, -24(%rsp)
movq %rdi, -16(%rsp)
movdqa -24(%rsp), %xmm0
ret
This builds the value in integer registers and goes through the stack, so the
final 128-bit load hits a mismatched store (two 64-bit stores followed by a
128-bit movdqa) and stalls on store forwarding.
Now we produce:
movd %r9d, %xmm3
movd %ecx, %xmm1
movd %esi, %xmm2
movl $255, %eax
movd %eax, %xmm0
pinsrw $1, %r8d, %xmm3
pinsrw $1, %edx, %xmm1
pinsrw $1, 8(%rsp), %xmm0
pinsrw $1, %edi, %xmm2
punpckldq %xmm2, %xmm1
punpckldq %xmm3, %xmm0
punpcklqdq %xmm1, %xmm0
which is a lot better.
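To make it more obvious what the flag does: with inter-unit moves enabled a
scalar value can be moved from a GPR into an SSE register directly, while
with the flag disabled it has to be stored to memory and reloaded. A minimal
sketch (not part of the patch; register allocation and stack offsets are only
illustrative):

#include <emmintrin.h>
__m128i splat (int x)
{
  /* Broadcast a scalar into all four 32-bit lanes.  */
  return _mm_set1_epi32 (x);
}

with inter-unit moves enabled can compile to roughly

movd %edi, %xmm0
pshufd $0, %xmm0, %xmm0
ret

while with the flag disabled the argument is bounced through the stack,
roughly

movl %edi, -12(%rsp)
movd -12(%rsp), %xmm0
pshufd $0, %xmm0, %xmm0
ret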
Clang does:
movd %edi, %xmm0
movd %esi, %xmm1
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
movd %edx, %xmm0
movd %ecx, %xmm2
punpcklwd %xmm0, %xmm2 # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
punpckldq %xmm1, %xmm2 # xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
movd %r8d, %xmm0
movd %r9d, %xmm1
punpcklwd %xmm0, %xmm1 # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
movl $255, %eax
movd %eax, %xmm0
movd 8(%rsp), %xmm3 # xmm3 = mem[0],zero,zero,zero
punpcklwd %xmm3, %xmm0 # xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
punpckldq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
punpcklqdq %xmm2, %xmm0 # xmm0 = xmm0[0],xmm2[0]
This looks worse, but I like the comments explaining the shuffle semantics.
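For reference, the codegen difference is easy to compare by building the
testcase above with the default generic tuning versus one of the Bulldozer
tunings, which keep the flag disabled, e.g. (with the snippet saved as
test.c; exact output of course depends on the revision):

gcc -O2 -S test.c -mtune=generic
gcc -O2 -S test.c -mtune=bdver1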
Bootstrapped/regtested x86_64-linux. Committed.
	* config/i386/x86-tune.def (X86_TUNE_INTER_UNIT_MOVES_TO_VEC): Enable
	for generic.
Index: config/i386/x86-tune.def
===================================================================
--- config/i386/x86-tune.def (revision 267477)
+++ config/i386/x86-tune.def (working copy)
@@ -379,9 +379,13 @@ DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "s
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer
to SSE registers. If disabled, the moves will be done by storing
- the value to memory and reloading. */
+ the value to memory and reloading.
+   Enable this flag for generic - the only relevant architecture preferring
+   no inter-unit moves is Bulldozer.  While this causes a small regression on
+   SPECfp scores (below 0.3%), disabling inter-unit moves noticeably penalizes
+   hand-written vectorized code which uses e.g. _mm_set_epi16. */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
- ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC))
+ ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE
to integer registers. If disabled, the moves will be done by storing