https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105275

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2025-04-26

--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I built with -march=native -Ofast -flto -fno-ipa-cp + PGO (without -flto the
problem does not reproduce).

imagick has a bad train run: the hottest loop is not trained at all. Compared
to GCC15, the cold loop is now much slower:

  50.26%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MorphologyApply.cold
  15.85%  imagick_r_peak.  imagick_r_peak.gcc15-pgolto-Ofast-native-m64        
 [.] MorphologyApply.cold
   9.93%  imagick_r_peak.  imagick_r_peak.gcc15-pgolto-Ofast-native-m64        
 [.] MeanShiftImage
   9.24%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MeanShiftImage
   4.41%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] GetVirtualPixelsFromNexus
   3.64%  imagick_r_peak.  imagick_r_peak.gcc15-pgolto-Ofast-native-m64        
 [.] GetVirtualPixelsFromNexus
   1.51%  imagick_r_peak.  imagick_r_peak.gcc15-pgolto-Ofast-native-m64        
 [.] MagickRound.lto_priv.1.lto_priv.0
   1.38%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MagickRound.lto_priv.1.lto_priv.0
   1.21%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] GetOneCacheViewVirtualPixel
   1.18%  imagick_r_peak.  imagick_r_peak.gcc15-pgolto-Ofast-native-m64        
 [.] GetOneCacheViewVirtualPixel

Codegen of the internal loop is the same for both versions:

   0.00 │2afd:┌─ cmp          %rdx,%rcx                                ▒
   0.00 │     │  jge          312f                                     ▒
        │     │result.red     += (*k)*k_pixels[u].red;                 ▒
        │     │  imul         $0xfffffffffffffff8,%rcx,%r11            ▒
  17.91 │     │  vmovsd       (%rax,%r11,1),%xmm5                      ▒
   0.00 │     │  movzwl       0x4(%rbx,%rcx,8),%r11d                   ▒
   0.00 │     │  vcvtsi2sd    %r11d,%xmm6,%xmm6                        ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
  22.46 │     │  movzwl       0x2(%rbx,%rcx,8),%r11d                   ▒
        │     │result.red     += (*k)*k_pixels[u].red;                 ▒
   0.00 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm4                        ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
   0.04 │     │  vcvtsi2sd    %r11d,%xmm6,%xmm6                        ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
  17.09 │     │  movzwl       (%rbx,%rcx,8),%r11d                      ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
   0.00 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm3                        ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
  12.99 │     │  vcvtsi2sd    %r11d,%xmm6,%xmm6                        ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;             ▒
   0.15 │     │  movzwl       0x6(%rbx,%rcx,8),%r11d                   ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
   0.00 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm2                        ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;             ▒
  19.48 │     │  vcvtsi2sd    %r11d,%xmm6,%xmm6                        ▒
   0.00 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm1                        ▒
        │     │if ( image->colorspace == CMYKColorspace)               ▒
   9.61 │     │  cmp          $0xc,%esi                                ◆
   0.00 │     │  je           317c                                     ▒
        │     │for (u=0; u < (ssize_t) kernel->width; u++, k--) {      ▒
        │2b58:│  inc          %rcx                                     ▒
   0.00 │     └──jmp          2afd                                     ▒


   0.02 │2b34:┌─ cmp          %rdx,%rcx                                ▒
   3.62 │     │  jge          3166                                     ▒
        │     │result.red     += (*k)*k_pixels[u].red;                 ▒
        │     │  imul         $0xfffffffffffffff8,%rcx,%r11            ▒
   0.19 │     │  vmovsd       (%rax,%r11,1),%xmm5                      ▒
   8.36 │     │  movzwl       0x4(%rbx,%rcx,8),%r11d                   ▒
   0.00 │     │  vcvtsi2sd    %r11d,%xmm12,%xmm6                       ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
   7.13 │     │  movzwl       0x2(%rbx,%rcx,8),%r11d                   ▒
        │     │result.red     += (*k)*k_pixels[u].red;                 ▒
  19.29 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm4                        ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
   0.10 │     │  vcvtsi2sd    %r11d,%xmm12,%xmm6                       ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
   4.76 │     │  movzwl       (%rbx,%rcx,8),%r11d                      ▒
        │     │result.green   += (*k)*k_pixels[u].green;               ▒
  13.90 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm3                        ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
   0.03 │     │  vcvtsi2sd    %r11d,%xmm12,%xmm6                       ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;             ▒
   7.87 │     │  movzwl       0x6(%rbx,%rcx,8),%r11d                   ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;                ▒
  11.34 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm2                        ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;             ▒
   0.07 │     │  vcvtsi2sd    %r11d,%xmm12,%xmm6                       ▒
   4.52 │     │  vfmadd231sd  %xmm6,%xmm5,%xmm1                        ▒
        │     │if ( image->colorspace == CMYKColorspace)               ◆
   7.03 │     │  cmp          $0xc,%esi                                ▒
   0.01 │     │  je           31b3                                     ▒
        │     │for (u=0; u < (ssize_t) kernel->width; u++, k--) {      ▒
        │2b8f:│  inc          %rcx                                     ▒
  11.13 │     └──jmp          2b34                                     ▒

Reply via email to