https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938

--- Comment #6 from postmaster at raasu dot org ---
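I compiled the same intrinsics-based code with both clang and gcc (the clang
listing below is an Intel-syntax disassembly, the gcc listing is AT&T-syntax
assembly, so the operand order differs between the two):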

clang:

 movdqa xmm1,XMMWORD PTR [rip+0xd98]        # 402050 <_IO_stdin_used+0x50>
 pand   xmm1,xmm0
 movdqa xmm2,xmm0
 pshufb xmm2,XMMWORD PTR [rip+0xd97]        # 402060 <_IO_stdin_used+0x60>
 movdqa xmm3,xmm0
 pshufb xmm3,XMMWORD PTR [rip+0xd9a]        # 402070 <_IO_stdin_used+0x70>
 paddd  xmm2,xmm1
 psrld  xmm0,0x18
 paddd  xmm0,xmm3
 paddd  xmm0,xmm2

gcc:

 movdqa  %xmm0, %xmm1
 movdqa  %xmm0, %xmm2
 movdqa  %xmm0, %xmm3
 pshufb  .LC0(%rip), %xmm1
 pshufb  .LC1(%rip), %xmm2
 pshufb  .LC2(%rip), %xmm3
 pshufb  .LC3(%rip), %xmm0
 paddd   %xmm3, %xmm0
 paddd   %xmm2, %xmm0
 paddd   %xmm1, %xmm0


This is the function using intrinsics:

/*
 * Horizontal add of unsigned bytes within each 32-bit lane of `a`.
 *
 * Each of the four shuffles below selects one byte position (0, 1, 2 or 3)
 * out of every 32-bit lane and places it in the lowest byte of that lane.
 * The remaining three control bytes per lane are 0x80; a shuffle control
 * byte with its high bit set makes _mm_shuffle_epi8 (PSHUFB) write zero, so
 * the selected byte ends up zero-extended to 32 bits.
 *
 * Note that _mm_set_epi8 takes its arguments from the most significant byte
 * (byte 15) down to the least significant byte (byte 0), so the last
 * argument of each call is the control for byte 0.
 *
 * Returns four 32-bit lanes, each holding the sum of the four bytes that
 * lane contained in `a`. Every sum is at most 4 * 255, so the 32-bit
 * additions cannot wrap.
 *
 * Requires SSSE3 for _mm_shuffle_epi8.
 * NOTE(review): noinline is presumably here so the generated code for this
 * function can be inspected in isolation -- confirm against the bug report.
 */
static __m128i __attribute__((noinline)) haddd_epu8(__m128i a)
{
   /* b1: byte 0 of each 32-bit lane (source bytes 0, 4, 8, 12). */
   __m128i b1 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 12, 0x80,
0x80, 0x80,  8, 0x80, 0x80, 0x80,  4, 0x80, 0x80, 0x80,  0));
   /* b2: byte 1 of each 32-bit lane (source bytes 1, 5, 9, 13). */
   __m128i b2 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 13, 0x80,
0x80, 0x80,  9, 0x80, 0x80, 0x80,  5, 0x80, 0x80, 0x80,  1));
   /* b3: byte 2 of each 32-bit lane (source bytes 2, 6, 10, 14). */
   __m128i b3 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 14, 0x80,
0x80, 0x80, 10, 0x80, 0x80, 0x80,  6, 0x80, 0x80, 0x80,  2));
   /* b4: byte 3 of each 32-bit lane (source bytes 3, 7, 11, 15). */
   __m128i b4 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 15, 0x80,
0x80, 0x80, 11, 0x80, 0x80, 0x80,  7, 0x80, 0x80, 0x80,  3));
   /* Sum the four zero-extended byte planes lane by lane. */
   __m128i c = _mm_add_epi32(b1, _mm_add_epi32(b2, _mm_add_epi32(b3, b4)));
   return c;
}

Reply via email to