https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111332
--- Comment #6 from d_vampile <d_vampile at 163 dot com> ---

GCC 7.3.0 produces:

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
  401170: c5 fa 6f 1e             vmovdqu (%rsi),%xmm3
        dst = (uint8_t *)dst + 128;
  401174: 48 83 ef 80             sub $0xffffffffffffff80,%rdi
        src = (const uint8_t *)src + 128;
  401178: 48 83 ee 80             sub $0xffffffffffffff80,%rsi
  40117c: c5 fa 6f 56 a0          vmovdqu -0x60(%rsi),%xmm2
  401181: c4 e3 65 38 5e 90 01    vinserti128 $0x1,-0x70(%rsi),%ymm3,%ymm3
  401188: c5 fa 6f 4e c0          vmovdqu -0x40(%rsi),%xmm1
  40118d: c4 e3 6d 38 56 b0 01    vinserti128 $0x1,-0x50(%rsi),%ymm2,%ymm2
  401194: c5 fa 6f 46 e0          vmovdqu -0x20(%rsi),%xmm0
  401199: c4 e3 75 38 4e d0 01    vinserti128 $0x1,-0x30(%rsi),%ymm1,%ymm1
  4011a0: c4 e3 7d 38 46 f0 01    vinserti128 $0x1,-0x10(%rsi),%ymm0,%ymm0
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
  4011a7: c5 f8 11 5f 80          vmovups %xmm3,-0x80(%rdi)
  4011ac: c4 e3 7d 39 5f 90 01    vextracti128 $0x1,%ymm3,-0x70(%rdi)
  4011b3: c5 f8 11 57 a0          vmovups %xmm2,-0x60(%rdi)
  4011b8: c4 e3 7d 39 57 b0 01    vextracti128 $0x1,%ymm2,-0x50(%rdi)
  4011bf: c5 f8 11 4f c0          vmovups %xmm1,-0x40(%rdi)
  4011c4: c4 e3 7d 39 4f d0 01    vextracti128 $0x1,%ymm1,-0x30(%rdi)
  4011cb: c5 f8 11 47 e0          vmovups %xmm0,-0x20(%rdi)
  4011d0: c4 e3 7d 39 47 f0 01    vextracti128 $0x1,%ymm0,-0x10(%rdi)
        while (n >= 128) {
  4011d7: 48 39 c7                cmp %rax,%rdi
  4011da: 75 94                   jne 401170 <rte_mov128blocks+0x20>
  4011dc: c5 f8 77                vzeroupper

In terms of runtime, this code performs the best.