https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79938
--- Comment #6 from postmaster at raasu dot org ---

I tried the identical code using intrinsics with both clang and gcc:

clang:

    movdqa xmm1,XMMWORD PTR [rip+0xd98]        # 402050 <_IO_stdin_used+0x50>
    pand   xmm1,xmm0
    movdqa xmm2,xmm0
    pshufb xmm2,XMMWORD PTR [rip+0xd97]        # 402060 <_IO_stdin_used+0x60>
    movdqa xmm3,xmm0
    pshufb xmm3,XMMWORD PTR [rip+0xd9a]        # 402070 <_IO_stdin_used+0x70>
    paddd  xmm2,xmm1
    psrld  xmm0,0x18
    paddd  xmm0,xmm3
    paddd  xmm0,xmm2

gcc:

    movdqa %xmm0, %xmm1
    movdqa %xmm0, %xmm2
    movdqa %xmm0, %xmm3
    pshufb .LC0(%rip), %xmm1
    pshufb .LC1(%rip), %xmm2
    pshufb .LC2(%rip), %xmm3
    pshufb .LC3(%rip), %xmm0
    paddd  %xmm3, %xmm0
    paddd  %xmm2, %xmm0
    paddd  %xmm1, %xmm0

This is the function using intrinsics:

    static __m128i __attribute__((noinline)) haddd_epu8(__m128i a)
    {
        __m128i b1 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 12, 0x80, 0x80, 0x80, 8, 0x80, 0x80, 0x80, 4, 0x80, 0x80, 0x80, 0));
        __m128i b2 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 1));
        __m128i b3 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 14, 0x80, 0x80, 0x80, 10, 0x80, 0x80, 0x80, 6, 0x80, 0x80, 0x80, 2));
        __m128i b4 = _mm_shuffle_epi8(a, _mm_set_epi8(0x80, 0x80, 0x80, 15, 0x80, 0x80, 0x80, 11, 0x80, 0x80, 0x80, 7, 0x80, 0x80, 0x80, 3));
        __m128i c = _mm_add_epi32(b1, _mm_add_epi32(b2, _mm_add_epi32(b3, b4)));
        return c;
    }