https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713
--- Comment #7 from Chris Elrod <elrodc at gmail dot com> ---
Created attachment 45357
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=45357&action=edit
Assembly generated by the Flang compiler on the original version of the code.

This is the main loop body in the Flang-compiled version of the original code
(starts at line 132):

.LBB1_8:                                # %vector.body
                                        # =>This Inner Loop Header: Depth=1
        leaq    (%rsi,%rbx,4), %r12
        vmovups (%rcx,%r12), %zmm2
        addq    %rcx, %r12
        leaq    (%r12,%rcx), %rbp
        vmovups (%r11,%rbp), %zmm3
        addq    %r11, %rbp
        leaq    (%rcx,%rbp), %r13
        leaq    (%rcx,%r13), %r8
        leaq    (%r8,%rcx), %r10
        leaq    (%r10,%rcx), %r14
        vmovups (%rcx,%r14), %zmm4
        vrsqrt14ps      %zmm4, %zmm5
        vmulps  %zmm5, %zmm4, %zmm4
        vfmadd213ps     %zmm0, %zmm5, %zmm4 # zmm4 = (zmm5 * zmm4) + zmm0
        vmulps  %zmm1, %zmm5, %zmm5
        vmulps  %zmm4, %zmm5, %zmm4
.Ltmp1:
        .loc    1 31 1 is_stmt 1        # vectorization_test.f90:31:1
        vmulps  (%rcx,%r8), %zmm4, %zmm5
        .loc    1 32 1                  # vectorization_test.f90:32:1
        vmulps  (%rcx,%r10), %zmm4, %zmm6
        vmovups (%rcx,%r13), %zmm7
        .loc    1 33 1                  # vectorization_test.f90:33:1
        vfnmadd231ps    %zmm6, %zmm6, %zmm7 # zmm7 = -(zmm6 * zmm6) + zmm7
        vrsqrt14ps      %zmm7, %zmm8
        vmulps  %zmm8, %zmm7, %zmm7
        vfmadd213ps     %zmm0, %zmm8, %zmm7 # zmm7 = (zmm8 * zmm7) + zmm0
        vmulps  %zmm1, %zmm8, %zmm8
        vmulps  %zmm7, %zmm8, %zmm7
        vmovups (%rcx,%rbp), %zmm8
        .loc    1 35 1                  # vectorization_test.f90:35:1
        vfnmadd231ps    %zmm5, %zmm6, %zmm8 # zmm8 = -(zmm6 * zmm5) + zmm8
        vmulps  %zmm8, %zmm7, %zmm8
        vmulps  %zmm5, %zmm5, %zmm9
        vfmadd231ps     %zmm8, %zmm8, %zmm9 # zmm9 = (zmm8 * zmm8) + zmm9
        vsubps  %zmm9, %zmm3, %zmm3
        vrsqrt14ps      %zmm3, %zmm9
        vmulps  %zmm9, %zmm3, %zmm3
        vfmadd213ps     %zmm0, %zmm9, %zmm3 # zmm3 = (zmm9 * zmm3) + zmm0
        vmulps  %zmm1, %zmm9, %zmm9
        vmulps  %zmm3, %zmm9, %zmm3
        .loc    1 39 1                  # vectorization_test.f90:39:1
        vmulps  %zmm8, %zmm7, %zmm8
        .loc    1 40 1                  # vectorization_test.f90:40:1
        vmulps  (%rcx,%r12), %zmm4, %zmm4
        .loc    1 39 1                  # vectorization_test.f90:39:1
        vmulps  %zmm3, %zmm8, %zmm8
        .loc    1 41 1                  # vectorization_test.f90:41:1
        vmulps  %zmm8, %zmm2, %zmm9
        vfmsub231ps     (%rsi,%rbx,4), %zmm3, %zmm9 # zmm9 = (zmm3 * mem) - zmm9
        vmulps  %zmm5, %zmm3, %zmm3
        vfmsub231ps     %zmm8, %zmm6, %zmm3 # zmm3 = (zmm6 * zmm8) - zmm3
        vfmadd213ps     %zmm9, %zmm4, %zmm3 # zmm3 = (zmm4 * zmm3) + zmm9
        .loc    1 42 1                  # vectorization_test.f90:42:1
        vmulps  %zmm4, %zmm6, %zmm5
        vmulps  %zmm5, %zmm7, %zmm5
        vfmsub231ps     %zmm7, %zmm2, %zmm5 # zmm5 = (zmm2 * zmm7) - zmm5
.Ltmp2:
        .loc    1 15 1                  # vectorization_test.f90:15:1
        vmovups %zmm3, (%rdi,%rbx,4)
        movq    -16(%rsp), %rbp         # 8-byte Reload
        vmovups %zmm5, (%rbp,%rbx,4)
        vmovups %zmm4, (%rax,%rbx,4)
        addq    $16, %rbx
        cmpq    %rbx, %rdx
        jne     .LBB1_8

zmm registers are 64-byte registers. The loop vmovups data from memory into
zmm registers, performs a series of arithmetic operations and inverse square
roots on them, and then vmovups three of these 64-byte registers back into
memory. That is the most efficient memory access pattern (as demonstrated
empirically via benchmarks).
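For reference, each vrsqrt14ps / vmulps / vfmadd213ps / vmulps group above is
the standard Newton-Raphson refinement of the 14-bit reciprocal-square-root
estimate. A rough C-intrinsics sketch of that sequence, assuming zmm0 and zmm1
hold the constants -3.0f and -0.5f (they are loaded outside this excerpt, so
that is an inference; the function name is mine):

#include <immintrin.h>

/* Sketch only: one Newton-Raphson step on the vrsqrt14ps estimate,
   computing 0.5*y*(3 - x*y*y) for y = rsqrt14(x). */
static inline __m512 rsqrt_nr(__m512 x) {
    const __m512 cm3 = _mm512_set1_ps(-3.0f);  /* assumed contents of zmm0 */
    const __m512 cmh = _mm512_set1_ps(-0.5f);  /* assumed contents of zmm1 */
    __m512 y = _mm512_rsqrt14_ps(x);           /* vrsqrt14ps: ~14-bit estimate */
    __m512 t = _mm512_mul_ps(y, x);            /* vmulps: x*y */
    t = _mm512_fmadd_ps(y, t, cm3);            /* vfmadd213ps: x*y*y - 3 */
    __m512 u = _mm512_mul_ps(cmh, y);          /* vmulps: -0.5*y */
    return _mm512_mul_ps(u, t);                /* vmulps: 0.5*y*(3 - x*y*y) */
}

One refinement step roughly doubles the accurate bits of the estimate, which
is why no vsqrtps or vdivps appears anywhere in the loop.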
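To make the claimed access pattern concrete, here is a hypothetical
C-intrinsics loop with the same shape (the function name, arguments, and the
arithmetic in the middle are invented for illustration, and n is assumed to
be a multiple of 16):

/* Sketch of the pattern above: unit-stride 64-byte loads, in-register
   arithmetic, and three unit-stride 64-byte stores per iteration. */
void pattern_sketch(const float *a, const float *b,
                    float *x, float *y, float *z, long n) {
    for (long i = 0; i < n; i += 16) {
        __m512 va = _mm512_loadu_ps(a + i);     /* vmovups load */
        __m512 vb = _mm512_loadu_ps(b + i);     /* vmovups load */
        __m512 r  = _mm512_rsqrt14_ps(vb);      /* vrsqrt14ps */
        __m512 s  = _mm512_mul_ps(va, r);       /* vmulps */
        __m512 t  = _mm512_fmadd_ps(va, s, r);  /* vfmadd213ps */
        _mm512_storeu_ps(x + i, r);             /* vmovups store */
        _mm512_storeu_ps(y + i, s);             /* vmovups store */
        _mm512_storeu_ps(z + i, t);             /* vmovups store */
    }
}

Every load and store is a full, contiguous 64-byte vector, so each iteration
walks linearly through the arrays instead of gathering or scattering.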