[Bug tree-optimization/88760] GCC unrolling is suboptimal

rguenth at gcc dot gnu.org Wed, 09 Jan 2019 02:13:44 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760


--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
So LLVM unrolls 4 times while GCC (always) unrolls 8 times.  The unrolled body
for GCC (x86_64 this time) is

.L4:
        movl    (%rdx), %ecx
        vmovsd  (%rax), %xmm8
        addq    $32, %rdx
        addq    $64, %rax
        vmovsd  -56(%rax), %xmm9
        vmovsd  -48(%rax), %xmm10
        vfmadd231sd     (%rsi,%rcx,8), %xmm8, %xmm0
        movl    -28(%rdx), %ecx
        vmovsd  -40(%rax), %xmm11
        vmovsd  -32(%rax), %xmm12
        vfmadd231sd     (%rsi,%rcx,8), %xmm9, %xmm0
        movl    -24(%rdx), %ecx
        vmovsd  -24(%rax), %xmm13
        vmovsd  -16(%rax), %xmm14
        vfmadd231sd     (%rsi,%rcx,8), %xmm10, %xmm0
        movl    -20(%rdx), %ecx
        vmovsd  -8(%rax), %xmm15
        vfmadd231sd     (%rsi,%rcx,8), %xmm11, %xmm0
        movl    -16(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm12, %xmm0
        movl    -12(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm13, %xmm0
        movl    -8(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm14, %xmm0
        movl    -4(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm15, %xmm0
        cmpq    %rax, %r9
        jne     .L4

and what you quoted is the prologue.  You didn't quote LLVM's prologue,
but if I read my clang's output correctly it uses a loop there.
(Is there something like -fdump-tree-optimized for clang?)

Our RTL unroller cannot emit the prologue as a loop; instead it always
generates peeled copies of the loop body that are jumped into.  Using
--param max-unroll-times=4 produces

.L4:
        movl    (%rdx), %ecx
        vmovsd  (%rax), %xmm2
        addq    $16, %rdx
        addq    $32, %rax
        vmovsd  -24(%rax), %xmm3
        vmovsd  -16(%rax), %xmm4
        vfmadd231sd     (%rsi,%rcx,8), %xmm2, %xmm0
        movl    -12(%rdx), %ecx
        vmovsd  -8(%rax), %xmm5
        vfmadd231sd     (%rsi,%rcx,8), %xmm3, %xmm0
        movl    -8(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm4, %xmm0
        movl    -4(%rdx), %ecx
        vfmadd231sd     (%rsi,%rcx,8), %xmm5, %xmm0
        cmpq    %rax, %r8
        jne     .L4

which is nearly equivalent to clang's variant?

[Bug tree-optimization/88760] GCC unrolling is suboptimal

Reply via email to