https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79709
--- Comment #9 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
The generated code for the loop seems to be on par with what clang and icc do, so that part is fixed.

Initialization is strange for icc. For clang, it is really quite short:

foo:                                    # @foo
        .cfi_startproc
# BB#0:
        vxorps  %ymm2, %ymm2, %ymm2
        vmovapd .LCPI0_0(%rip), %ymm8   # ymm8 = [4.000000e+00,4.000000e+00,4.000000e+00,4.000000e+00]
        vmovapd %ymm1, %ymm4
        vmovapd %ymm0, %ymm5
        .p2align        4, 0x90
.LBB0_1:                                # =>This I

vs. gcc:

foo:
.LFB0:
        .cfi_startproc
        vmovsd  .LC0(%rip), %xmm2
        vmovapd %ymm1, %ymm7
        vpxor   %xmm5, %xmm5, %xmm5
        vmovq   %xmm2, %xmm9
        vmulpd  %ymm1, %ymm1, %ymm10
        vmovapd %xmm9, %xmm9
        vunpcklpd       %xmm2, %xmm9, %xmm3
        vinsertf128     $0x0, %xmm3, %ymm9, %ymm9
        vextractf128    $0x1, %ymm9, %xmm3
        vmovsd  %xmm2, %xmm3, %xmm3
        vinsertf128     $0x1, %xmm3, %ymm9, %ymm9
        vextractf128    $0x1, %ymm9, %xmm3
        vunpcklpd       %xmm2, %xmm3, %xmm3
        vmovsd  .LC1(%rip), %xmm2
        vmovq   %xmm2, %xmm8
        vinsertf128     $0x1, %xmm3, %ymm9, %ymm9
        vmovapd %xmm8, %xmm8
        vunpcklpd       %xmm2, %xmm8, %xmm3
        vinsertf128     $0x0, %xmm3, %ymm8, %ymm8
        vextractf128    $0x1, %ymm8, %xmm3
        vmovsd  %xmm2, %xmm3, %xmm3
        vinsertf128     $0x1, %xmm3, %ymm8, %ymm8
        vextractf128    $0x1, %ymm8, %xmm3
        vunpcklpd       %xmm2, %xmm3, %xmm3
        vinsertf128     $0x1, %xmm3, %ymm8, %ymm8
        vmovapd %ymm0, %ymm3
        jmp     .L3
        .p2align 4,,10
        .p2align 3
.L13: