https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86557
--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
ICC seems to emulate this even for SSE2 where I'm not sure this is profitable:

..B1.2:                         # Preds ..B1.2 ..B1.1
                                # Execution count [1.02e+03]
        movdqu    .L_2il0floatpacket.0(%rip), %xmm2             #6.19
        lea       x(,%rax,8), %rdx                              #6.12
        movdqu    (%rdx), %xmm1                                 #6.12
        movdqa    %xmm2, %xmm0                                  #6.19
        pand      %xmm1, %xmm0                                  #6.19
        movdqa    %xmm1, %xmm3                                  #6.19
        psrlq     $1, %xmm3                                     #6.19
        psrad     $1, %xmm0                                     #6.19
        por       %xmm0, %xmm3                                  #6.19
        psrlq     $62, %xmm3                                    #6.19
        paddq     %xmm1, %xmm3                                  #6.19
        pand      %xmm3, %xmm2                                  #6.19
        psrlq     $2, %xmm3                                     #6.19
        psrad     $2, %xmm2                                     #6.19
        por       %xmm2, %xmm3                                  #6.19
        movdqu    %xmm3, (%rdx)                                 #6.5
        addq      $2, %rax                                      #5.3
        cmpq      $1024, %rax                                   #5.3
        jb        ..B1.2        # Prob 99%                      #5.3

and for AVX2:

..B1.2:                         # Preds ..B1.2 ..B1.1
                                # Execution count [1.02e+03]
        lea       x(,%rax,8), %rdx                              #6.12
        vmovdqu   (%rdx), %ymm4                                 #6.12
        vpsrlq    $1, %ymm4, %ymm0                              #6.19
        vpsrad    $1, %ymm4, %ymm1                              #6.19
        vpblendw  $204, %ymm1, %ymm0, %ymm2                     #6.19
        vpsrlq    $62, %ymm2, %ymm3                             #6.19
        vpaddq    %ymm4, %ymm3, %ymm5                           #6.19
        vpsrlq    $2, %ymm5, %ymm6                              #6.19
        vpsrad    $2, %ymm5, %ymm7                              #6.19
        vpblendw  $204, %ymm7, %ymm6, %ymm8                     #6.19
        vmovdqu   %ymm8, (%rdx)                                 #6.5
        addq      $4, %rax                                      #5.3
        cmpq      $1024, %rax                                   #5.3
        jb        ..B1.2        # Prob 99%                      #5.3

long x[1024];

void foo()
{
  for (int i = 0; i < 1024; ++i)
    x[i] = x[i] / 4;
}