https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63537
--- Comment #2 from Tavian Barnes <tavianator at gmail dot com> ---
Is it possible to make SRA (scalar replacement of aggregates) work even if the
loop isn't unrolled? If the array size is increased to 4, then -O2 doesn't
unroll the loop at all, resulting in:
# Quoted -O2 output (x86-64, AT&T syntax): every element is scaled into a
# temporary below %rsp, then the temporary is copied out through %rax.
# rax = rdi; rax is the base address of the final stores further down.
# NOTE(review): presumably rdi is the hidden return-slot pointer (SysV AMD64
# struct return) - confirm against the test case attached to the report.
movq %rdi, %rax
# rdx = 0; rdx is the byte offset that drives the loop.
xorl %edx, %edx
.L3:
# xmm1 = the 8-byte double at rsp + 8 + rdx.
movsd 8(%rsp,%rdx), %xmm1
# xmm1 *= xmm0 (xmm0 is not written anywhere in this listing).
mulsd %xmm0, %xmm1
# Store the product at rsp - 40 + rdx, i.e. into a temporary below rsp.
# NOTE(review): presumably the SysV red zone of a leaf function - confirm.
movsd %xmm1, -40(%rsp,%rdx)
# Advance by one 8-byte element; leave the loop once the offset reaches 32,
# which is four iterations.
addq $8, %rdx
cmpq $32, %rdx
jne .L3
# Copy-out: move the four 8-byte results from -40(%rsp)..-16(%rsp) to
# 0(%rax)..24(%rax), one at a time through rdx.
movq -40(%rsp), %rdx
movq %rdx, (%rax)
movq -32(%rsp), %rdx
movq %rdx, 8(%rax)
movq -24(%rsp), %rdx
movq %rdx, 16(%rax)
movq -16(%rsp), %rdx
movq %rdx, 24(%rax)
ret
which would be a lot prettier as something like:
# Proposed output (x86-64, AT&T syntax): the same four-iteration loop, but
# each product is stored directly at rax + rdx, so there is no temporary
# below %rsp and no copy-out sequence before the return.
# rax = rdi; rax is the base address of the stores inside the loop.
movq %rdi, %rax
# rdx = 0 (a 32-bit write to edx clears the upper half of rdx).
xorl %edx, %edx
.L3:
# xmm1 = the 8-byte double at rsp + 8 + rdx.
movsd 8(%rsp,%rdx), %xmm1
# xmm1 *= xmm0.
mulsd %xmm0, %xmm1
# Store the product straight to its final location at rax + rdx.
movsd %xmm1, (%rax,%rdx)
# 32-bit add/compare on edx: the write zero-extends into rdx, so the 64-bit
# index used in the addressing modes above stays valid. Exit at offset 32,
# after four 8-byte elements.
addl $8, %edx
cmpl $32, %edx
jne .L3
ret