https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114944
John Platts <john_platts at hotmail dot com> changed:

           What            |Removed                     |Added
----------------------------------------------------------------------------
           Target          |                            |x86_64-*-*, i?86-*-*

--- Comment #1 from John Platts <john_platts at hotmail dot com> ---
Here is another snippet of code that has suboptimal codegen on SSE2 with GCC
13.2.0:

#include <stdint.h>
#include <emmintrin.h>

__m128i SSE2ShuffleI8(__m128i a, __m128i b) {
  alignas(16) uint8_t a_lanes[16];
  alignas(16) uint8_t b_lanes[16];

  _mm_store_si128(reinterpret_cast<__m128i*>(a_lanes), a);
  _mm_store_si128(reinterpret_cast<__m128i*>(b_lanes),
                  _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))));

  __m128i v0 = _mm_cvtsi32_si128(a_lanes[b_lanes[0]]);
  __m128i v1 = _mm_cvtsi32_si128(a_lanes[b_lanes[1]]);
  __m128i v2 = _mm_cvtsi32_si128(a_lanes[b_lanes[2]]);
  __m128i v3 = _mm_cvtsi32_si128(a_lanes[b_lanes[3]]);
  __m128i v4 = _mm_cvtsi32_si128(a_lanes[b_lanes[4]]);
  __m128i v5 = _mm_cvtsi32_si128(a_lanes[b_lanes[5]]);
  __m128i v6 = _mm_cvtsi32_si128(a_lanes[b_lanes[6]]);
  __m128i v7 = _mm_cvtsi32_si128(a_lanes[b_lanes[7]]);
  __m128i v8 = _mm_cvtsi32_si128(a_lanes[b_lanes[8]]);
  __m128i v9 = _mm_cvtsi32_si128(a_lanes[b_lanes[9]]);
  __m128i v10 = _mm_cvtsi32_si128(a_lanes[b_lanes[10]]);
  __m128i v11 = _mm_cvtsi32_si128(a_lanes[b_lanes[11]]);
  __m128i v12 = _mm_cvtsi32_si128(a_lanes[b_lanes[12]]);
  __m128i v13 = _mm_cvtsi32_si128(a_lanes[b_lanes[13]]);
  __m128i v14 = _mm_cvtsi32_si128(a_lanes[b_lanes[14]]);
  __m128i v15 = _mm_cvtsi32_si128(a_lanes[b_lanes[15]]);

  v0 = _mm_unpacklo_epi8(v0, v1);
  v2 = _mm_unpacklo_epi8(v2, v3);
  v4 = _mm_unpacklo_epi8(v4, v5);
  v6 = _mm_unpacklo_epi8(v6, v7);
  v8 = _mm_unpacklo_epi8(v8, v9);
  v10 = _mm_unpacklo_epi8(v10, v11);
  v12 = _mm_unpacklo_epi8(v12, v13);
  v14 = _mm_unpacklo_epi8(v14, v15);

  v0 = _mm_unpacklo_epi16(v0, v2);
  v4 = _mm_unpacklo_epi16(v4, v6);
  v8 = _mm_unpacklo_epi16(v8, v10);
  v12 = _mm_unpacklo_epi16(v12, v14);

  v0 = _mm_unpacklo_epi32(v0, v4);
  v8 = _mm_unpacklo_epi32(v8, v12);

  return
_mm_unpacklo_epi64(v0, v8);
}

Here is the code that is generated when the above code is compiled on x86_64
GCC 13.2.0 with the -O2 option:

SSE2ShuffleI8(long long __vector(2), long long __vector(2)):
        sub     rsp, 144
        pand    xmm1, XMMWORD PTR .LC0[rip]
        movaps  XMMWORD PTR [rsp+120], xmm0
        movd    eax, xmm1
        movzx   eax, al
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+104], xmm1
        movd    xmm0, eax
        movzx   eax, BYTE PTR [rsp+105]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+88], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+90]
        punpcklbw xmm0, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+72], xmm1
        movd    xmm8, eax
        movzx   eax, BYTE PTR [rsp+75]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+56], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+60]
        punpcklbw xmm8, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+40], xmm1
        punpcklwd xmm0, xmm8
        movd    xmm5, eax
        movzx   eax, BYTE PTR [rsp+45]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+24], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp+30]
        punpcklbw xmm5, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp+8], xmm1
        movd    xmm7, eax
        movzx   eax, BYTE PTR [rsp+15]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-8], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp]
        punpcklbw xmm7, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-24], xmm1
        punpcklwd xmm5, xmm7
        punpckldq xmm0, xmm5
        movd    xmm3, eax
        movzx   eax, BYTE PTR [rsp-15]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-40], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-30]
        punpcklbw xmm3, xmm4
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-56], xmm1
        movd    xmm6, eax
        movzx   eax, BYTE PTR [rsp-45]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-72], xmm1
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-60]
        punpcklbw xmm6, xmm2
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-88], xmm1
        punpcklwd xmm3, xmm6
        movd    xmm2, eax
        movzx   eax, BYTE PTR [rsp-75]
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-104], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-90]
        punpcklbw xmm2, xmm4
        movzx   eax, BYTE PTR [rsp+120+rax]
        movaps  XMMWORD PTR [rsp-120], xmm1
        movd    xmm4, eax
        movzx   eax, BYTE PTR [rsp-105]
        movzx   eax, BYTE PTR [rsp+120+rax]
        add     rsp, 144
        movd    xmm1, eax
        punpcklbw xmm4, xmm1
        movdqa  xmm1, xmm2
        movdqa  xmm2, xmm3
        punpcklwd xmm1, xmm4
        punpckldq xmm2, xmm1
        punpcklqdq xmm0, xmm2
        ret
.LC0:
        .quad   1085102592571150095
        .quad   1085102592571150095

In the SSE2ShuffleI8 code generated above, GCC 13.2.0 unnecessarily stores the
result of _mm_and_si128(b, _mm_set1_epi8(static_cast<char>(15))) into 15
different memory locations when optimizations are enabled.