https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94427
Jan Hubicka <hubicka at gcc dot gnu.org> changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |hubicka at gcc dot gnu.org --- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> --- With profile feedback on zen4 we now get the hottest loop as: │ dc[k] = dc[k-1] + tpdd[k-1]; ▒ │16b0:┌─ vmovd (%r14,%rdx,1),%xmm2 ▒ 0.15 │ │ vpaddd %xmm2,%xmm0,%xmm0 ▒ 5.79 │ │ vmovd %xmm0,0x4(%rax,%rdx,1) ▒ │ │if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; ▒ 8.04 │ │ vmovd (%r15,%rdx,1),%xmm7 ▒ 0.16 │ │ vmovd (%rcx,%rdx,1),%xmm2 ▒ 0.41 │ │ vpaddd %xmm7,%xmm2,%xmm2 ▒ │ │if (dc[k] < -INFTY) dc[k] = -INFTY; ▒ 0.71 │ │ vmovdqa _IO_stdin_used+0x560,%xmm7 ◆ 1.07 │ │ vpmaxsd %xmm7,%xmm2,%xmm2 ▒ 0.73 │ │ vpmaxsd %xmm0,%xmm2,%xmm0 ▒ 5.83 │ │ vmovd %xmm0,0x4(%rax,%rdx,1) ▒ │ │for (k = 1; k <= M; k++) { ▒ 5.86 │ │ add $0x4,%rdx ▒ 1.40 │ ├──cmp %rdx,%r13 ▒ 0.00 │ └──jne 16b0 ▒ No time is spent in the cold section. Without profile I get: 88.80% hmmer_peak.chn- [.] P7Viterbi ◆ 5.10% hmmer_peak.chn- [.] sre_random ▒ 2.31% hmmer_peak.chn- [.] FChoose ▒ 1.35% hmmer_peak.chn- [.] RandomSequence ▒ so no time is spent in the cold section either. The internal loop is almost identical: │17e0:┌─ vmovd (%r11,%rdi,4),%xmm3 ▒ 0.07 │ │ mov %rdi,%r8 ▒ 0.09 │ │ vpaddd %xmm3,%xmm0,%xmm0 ▒ 6.20 │ │ vmovd %xmm0,0x4(%rdx,%rdi,4) ▒ │ │if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; ▒ 7.00 │ │ vmovd (%rax,%rdi,4),%xmm6 ▒ 0.19 │ │ vmovd (%r10,%rdi,4),%xmm3 ▒ 0.16 │ │ vpaddd %xmm3,%xmm6,%xmm3 ◆ │ │if (dc[k] < -INFTY) dc[k] = -INFTY; ▒ 1.25 │ │ vmovdqa _IO_stdin_used+0x600,%xmm6 ▒ 0.89 │ │ vpmaxsd %xmm6,%xmm3,%xmm3 ▒ 0.46 │ │ vpmaxsd %xmm0,%xmm3,%xmm0 ▒ 5.85 │ │ vmovd %xmm0,0x4(%rdx,%rdi,4) ▒ │ │for (k = 1; k <= M; k++) { ▒ 6.02 │ │ inc %rdi ▒ 2.48 │ ├──cmp %r8,%r9 ▒ 0.00 │ └──jne 17e0 ▒ However, the hottest loop seems to be completely elsewhere than shown by you, since it is an FP loop and yours seems to be integer?