https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81303

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Without peeling for alignment the numbers improve, but we still regress from
176s to 205s.  The innermost (unrolled) loop is:

.L11:
        vmovsd  (%rdi,%r15,2), %xmm2
        vmovsd  (%rsi,%r15,2), %xmm1
        movq    -56(%rbp), %rbx
        vmovhpd (%rdi,%r14), %xmm2, %xmm0
        vmovsd  (%rdi), %xmm2
        vmovhpd (%rsi,%r14), %xmm1, %xmm6
        vmovsd  (%rsi), %xmm1
        vmovhpd (%rdi,%r15), %xmm2, %xmm2
        vmovhpd (%rsi,%r15), %xmm1, %xmm1
        addq    %r11, %rdi
        addq    %r11, %rsi
        vinsertf128     $0x1, %xmm0, %ymm2, %ymm2
        vmovsd  (%rcx,%r15,2), %xmm0
        vinsertf128     $0x1, %xmm6, %ymm1, %ymm1
        vmulpd  (%rbx,%rax), %ymm2, %ymm3
        movq    -72(%rbp), %rbx
        vmovapd %ymm1, %ymm2
        vmovhpd (%rcx,%r14), %xmm0, %xmm6
        vmovsd  (%rcx), %xmm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm2
        movq    -88(%rbp), %rbx
        vmovhpd (%rcx,%r15), %xmm0, %xmm0
        addq    %r11, %rcx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vmulpd  (%rbx,%rax), %ymm0, %ymm3
        vmovsd  (%rdx,%r15,2), %xmm0
        movq    -64(%rbp), %rbx
        vmovhpd (%rdx,%r14), %xmm0, %xmm6
        vmovsd  (%rdx), %xmm0
        vmovhpd (%rdx,%r15), %xmm0, %xmm0
        addq    %r11, %rdx
        vinsertf128     $0x1, %xmm6, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm3, %ymm0
        vaddpd  %ymm0, %ymm2, %ymm1
        vmovsd  (%r9,%r15,2), %xmm0
        vmovhpd (%r9,%r14), %xmm0, %xmm3
        vmovsd  (%r9), %xmm0
        vmovhpd (%r9,%r15), %xmm0, %xmm0
        addq    %r11, %r9
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vmulpd  (%r12,%rax), %ymm0, %ymm2
        vmovsd  (%r8,%r15,2), %xmm0
        vmovhpd (%r8,%r14), %xmm0, %xmm3
        vmovsd  (%r8), %xmm0
        vmovhpd (%r8,%r15), %xmm0, %xmm0
        movq    -80(%rbp), %rbx
        addq    %r11, %r8
        vinsertf128     $0x1, %xmm3, %ymm0, %ymm0
        vfmadd132pd     (%rbx,%rax), %ymm2, %ymm0
        vaddpd  %ymm0, %ymm1, %ymm0
        vmovsd  (%r10,%r15,2), %xmm1
        vmovhpd (%r10,%r14), %xmm1, %xmm2
        vmovsd  (%r10), %xmm1
        vmovhpd (%r10,%r15), %xmm1, %xmm1
        addq    %r11, %r10
        vinsertf128     $0x1, %xmm2, %ymm1, %ymm1
        vfmadd231pd     0(%r13,%rax), %ymm1, %ymm4
        addq    $32, %rax
        vaddpd  %ymm4, %ymm0, %ymm4
        cmpq    -96(%rbp), %rax
        jne     .L11

vs

.L10:
        vmovsd  (%rax,%rbx,8), %xmm0
        vmulsd  (%r15,%rdx), %xmm0, %xmm0
        vmovsd  (%r8,%rdx), %xmm1
        vfmadd132sd     (%rax,%r11,8), %xmm0, %xmm1
        vmovsd  (%rax,%rsi,8), %xmm0
        vmulsd  (%r12,%rdx), %xmm0, %xmm0
        vmovsd  0(%rbp,%rdx), %xmm4
        vfmadd231sd     (%rax), %xmm4, %xmm0
        vmovsd  (%r14,%rdx), %xmm5
        vmovsd  (%rdi,%rdx), %xmm6
        vfmadd231sd     (%rax,%r9,8), %xmm6, %xmm2
        vaddsd  %xmm0, %xmm1, %xmm0
        vmovsd  (%rax,%r10,8), %xmm1
        vmulsd  0(%r13,%rdx), %xmm1, %xmm1
        vfmadd231sd     (%rax,%rcx,8), %xmm5, %xmm1
        addq    -112(%rsp), %rdx
        addq    $8, %rax
        vaddsd  %xmm2, %xmm1, %xmm2
        vaddsd  %xmm2, %xmm0, %xmm2
        cmpq    -120(%rsp), %rax
        jne     .L10
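
For scale, a rough count from the listings above: the vectorized body is 61
instructions covering four scalar iterations (~15 per element), while the
scalar body is 21 instructions per element, so on raw instruction count the
vectorized loop ought to win and the slowdown has to come from somewhere else.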

Looks like register pressure is high (note the reloads of spilled base
pointers, movq -56(%rbp), %rbx and friends, inside the loop body) and IVOPTs
doesn't do the best job either.  The vectorized loop might also run into CPU
microarchitectural limits with respect to the loop cache (the body is 310
bytes long).
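
Most of the fat in the vectorized body comes from assembling each vector
register lane by lane (vmovsd + vmovhpd + vinsertf128) because the loads are
strided.  A minimal C sketch of that access pattern (hypothetical names, not
the actual benchmark source; -ffast-math assumed so the FP reduction can be
vectorized at all):

        /* gcc -O3 -mavx2 -mfma -ffast-math */
        double
        dot_strided (const double *restrict q, const double *restrict c,
                     long n, long stride)
        {
          double sum = 0.0;
          for (long i = 0; i < n; i++)
            sum += q[i * stride] * c[i];  /* strided load feeds each lane */
          return sum;
        }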
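
One way to check the frontend theory (assuming an Intel CPU that exposes
these events; ./benchmark is a placeholder for the actual binary): if the
body no longer fits the loop buffer / uop cache, uops delivered by the
legacy decoders (idq.mite_uops) will dominate the uop supply.

        perf stat -e lsd.uops,idq.dsb_uops,idq.mite_uops ./benchmark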
