https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84201

Martin Liška <marxin at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|ASSIGNED                    |NEW
           Assignee|marxin at gcc dot gnu.org   |unassigned at gcc dot gnu.org

--- Comment #10 from Martin Liška <marxin at gcc dot gnu.org> ---
Using the -fdbg-cnt option:
-fdbg-cnt=vect_loop:0 -fdbg-cnt=vect_loop:4:5:power.fppized

I was able to track the problem down to a single vectorization, which happens here:

   308  !! Set up frequency vector(s)
   309  tmppower => first_power
   310  ipower = 1
   311  do while(associated(tmppower))
   312    if ( tmppower%nofreq>1 ) then
   313      freqstep = (tmppower%freqlast-tmppower%freqfirst) &
   314               / real(tmppower%nofreq - 1, kind=rfp)
   315    else
   316      freqstep = 0.0_rfp
   317    end if
   318    freq = tmppower%freqfirst
   319    do ifreq = 1, tmppower%nofreq <------ HERE
   320      frequency(ifreq,ipower) = freq
   321      freq = freq + freqstep
   322    end do
   323    tmppower => tmppower%next
   324    ipower = ipower + 1
   325  end do

vect dump:
...
power.fppized.f90:319:0: note:    Runtime profitability threshold = 4
power.fppized.f90:319:0: note:    Static estimate profitability threshold = 9
power.fppized.f90:319:0: note:  epilog loop required
power.fppized.f90:319:0: note:  vect_can_advance_ivs_p:
power.fppized.f90:319:0: note:  Analyze phi: freq_20 = PHI <pretmp_1948(136), freq_728(180)>
power.fppized.f90:319:0: note:  Analyze phi: ifreq_1653 = PHI <1(136), ifreq_729(180)>
power.fppized.f90:319:0: note:  Analyze phi: .MEM_137 = PHI <.MEM_134(136), .MEM_727(180)>
power.fppized.f90:319:0: note:  reduc or virtual phi. skip.
***dbgcnt: upper limit 5 reached for vect_loop.***
power.fppized.f90:319:0: optimized: loop vectorized using 32 byte vectors
power.fppized.f90:319:0: note:  === vec_transform_loop ===
power.fppized.f90:319:0: note:  Profitability threshold is 4 loop iterations.
...

which leads to the following assembly:

.L248:
        movl    88(%rdi), %esi
        vmovsd  96(%rdi), %xmm2
        cmpl    $1, %esi
        jle     .L243
        vmovsd  104(%rdi), %xmm1
        leal    -1(%rsi), %eax
        vcvtsi2sdl      %eax, %xmm5, %xmm3
        vsubsd  %xmm2, %xmm1, %xmm1
        testl   %esi, %esi
        movl    %r10d, %r9d
        vdivsd  %xmm3, %xmm1, %xmm3
        cmovg   %esi, %r9d
        cmpl    $3, %esi
        jle     .L371
        vaddsd  %xmm2, %xmm3, %xmm0
        movl    %r9d, %ecx
        shrl    $2, %ecx
        vaddsd  %xmm3, %xmm0, %xmm1
        salq    $5, %rcx
        vunpcklpd       %xmm0, %xmm2, %xmm0
        vaddsd  %xmm3, %xmm1, %xmm4
        addq    %r8, %rcx
        movq    %r8, %rax
        vunpcklpd       %xmm4, %xmm1, %xmm1
        vmulsd  %xmm6, %xmm3, %xmm4
        vinsertf128     $0x1, %xmm1, %ymm0, %ymm0
        vbroadcastsd    %xmm4, %ymm4
        .p2align 4,,10
        .p2align 3
.L250:
        vmovapd %ymm0, %ymm1
        vmovupd %ymm1, (%rax)
        addq    $32, %rax
        vaddpd  %ymm4, %ymm0, %ymm0
        cmpq    %rax, %rcx
        jne     .L250
        movl    %r9d, %eax
        andl    $-4, %eax
        vcvtsi2sdl      %eax, %xmm5, %xmm0
        leal    1(%rax), %ecx
        vfmadd231sd     %xmm3, %xmm0, %xmm2
        cmpl    %r9d, %eax
        je      .L251
        movslq  %ecx, %rcx
        addq    %rdx, %rcx
        addl    $2, %eax
        vmovsd  %xmm2, (%r11,%rcx,8)
        vaddsd  %xmm3, %xmm2, %xmm2
        cmpl    %esi, %eax
        jg      .L251
.L270:
        movslq  %eax, %rcx
        addq    %rdx, %rcx
        incl    %eax
        vmovsd  %xmm2, (%r11,%rcx,8)
        vaddsd  %xmm3, %xmm2, %xmm2
        cmpl    %eax, %esi
        jl      .L251
        cltq
        addq    %rdx, %rax
        vmovsd  %xmm2, (%r11,%rax,8)
.L251:
        movq    136(%rdi), %rdi
        addq    %r15, %rdx
        addq    %r12, %r8
        testq   %rdi, %rdi
        jne     .L248

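So there's probably nothing we can do right now? I'm removing the wrong-code
keyword, as this is about floating-point precision when using -Ofast together
with vectorization: the vectorized loop (.L250) advances freq by a broadcast
multiple of freqstep (vmulsd/vbroadcastsd/vaddpd) instead of adding freqstep
once per element, so the rounding differs from the scalar loop. That is a known
limitation, if I'm correct.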

Reply via email to