https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111551

Jan Hubicka <hubicka at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2025-03-05
             Status|UNCONFIRMED                 |NEW

--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Building imagemagick with -Ofast -fprofile-use (no -flto) yields:

  52.50%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MorphologyApply.cold
  28.65%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MeanShiftImage
  10.81%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] GetVirtualPixelsFromNexus
   3.67%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] GetOneCacheViewVirtualPixel
   2.26%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] MagickRound
   0.11%  imagick_r_peak.  imagick_r_peak.trunk-pgolto-Ofast-native-m64        
 [.] HorizontalFilter

So we declare the hot loop of MorphologyApply as cold.  This does not seem to be
due to the train run missing a hot spot of the ref run, since it reproduces even
if the train run data is replaced by ref run data.

The hot loop is the kernel of Morphology:

Percent │     │  je          3143                                 ▒
        │     │for (v=0; v < (ssize_t) kernel->height; v++) {     ▒
        │2dcb:│  inc         %rcx                                 ▒
        │     │  jmp         2d6a                                 ▒
        │     │return((Quantum) 0);                               ▒
        │2dd0:│  xor         %eax,%eax                            ▒
        │     │  jmp         207d                                 ▒
        │     │return(QuantumRange);                              ▒
        │2dd7:│  or          $0xffffffff,%eax                     ▒
        │     │  jmp         207d                                 ▒
        │     │return((Quantum) (value+0.5f));                    ▒
        │2ddf:│  vaddss      0xbfe65(%rip),%xmm0,%xmm0        # 53▒
        │     │  vcvttss2si  %xmm0,%eax                           ▒
        │     │  jmp         1f02                                 ▒
        │     │result.red     += (*k)*k_pixels[u].red;            ▒
        │2df0:│  imul        $0xfffffffffffffff8,%rax,%r9         ▒
   5.68 │     │  vxorpd      %xmm7,%xmm7,%xmm7                    ▒
   0.14 │     │  vmovsd      (%rcx,%r9,1),%xmm5                   ▒
   0.05 │     │  movzwl      0x4(%rbx,%rax,8),%r9d                ▒
   6.26 │     │  vcvtsi2sd   %r9d,%xmm7,%xmm6                     ▒
        │     │result.green   += (*k)*k_pixels[u].green;          ▒
   9.60 │     │  movzwl      0x2(%rbx,%rax,8),%r9d                ▒
        │     │result.red     += (*k)*k_pixels[u].red;            ▒
   0.00 │     │  vfmadd231sd %xmm6,%xmm5,%xmm4                    ▒
        │     │result.green   += (*k)*k_pixels[u].green;          ▒
  12.66 │     │  vcvtsi2sd   %r9d,%xmm7,%xmm6                     ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;           ▒
   7.89 │     │  movzwl      (%rbx,%rax,8),%r9d                   ▒
        │     │result.green   += (*k)*k_pixels[u].green;          ▒
        │     │  vfmadd231sd %xmm6,%xmm5,%xmm3                    ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;           ▒
  21.48 │     │  vcvtsi2sd   %r9d,%xmm7,%xmm6                     ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;        ▒
   2.67 │     │  movzwl      0x6(%rbx,%rax,8),%r9d                ▒
        │     │result.blue    += (*k)*k_pixels[u].blue;           ▒
        │     │  vfmadd231sd %xmm6,%xmm5,%xmm2                    ▒
        │     │result.opacity += (*k)*k_pixels[u].opacity;        ▒
  14.91 │     │  vcvtsi2sd   %r9d,%xmm7,%xmm6                     ▒
   2.07 │     │  vfmadd231sd %xmm6,%xmm5,%xmm1                    ▒
        │     │if ( image->colorspace == CMYKColorspace)          ▒
  11.53 │     │  cmp         $0xc,%r13d                           ▒
   4.63 │     │  je          2e4b                                 ▒
        │     │for (u=0; u < (ssize_t) kernel->width; u++, k--) { ▒
        │2e43:│  inc         %rax                                 ▒
   0.00 │     └──jmp         1f68                                 ▒

If I replace the train run with the refrate run, we consider the loop hot, so it
seems like a bad train run.

        │     result.opacity += (*k)*k_pixels[u].opacity;                       
   3.56 │700:   vmovdqu      (%rdx),%ymm0                                       
        │     result.red     += (*k)*k_pixels[u].red;                           
   5.34 │       vmovupd      (%r8),%ymm2                                        
   3.51 │       add          $0x20,%rdx                                         
   2.15 │       sub          $0x20,%r8                                          
        │     result.opacity += (*k)*k_pixels[u].opacity;                       
   4.38 │       vpmovzxwd    %xmm0,%ymm1                                        
   1.26 │       vextracti128 $0x1,%ymm0,%xmm0                                   
   2.52 │       vpermpd      $0xaa,%ymm2,%ymm11                                 
   3.37 │       vpermpd      $0x55,%ymm2,%ymm10                                 
   3.53 │       vcvtdq2pd    %xmm1,%ymm12                                       
   1.55 │       vextracti128 $0x1,%ymm1,%xmm1                                   
   3.61 │       vpmovzxwd    %xmm0,%ymm0                                        
   1.80 │       vbroadcastsd %xmm2,%ymm9                                        
   5.27 │       vcvtdq2pd    %xmm1,%ymm1                                        
   5.37 │       vpermpd      $0xff,%ymm2,%ymm2                                  
   2.68 │       vfmadd231pd  %ymm12,%ymm2,%ymm3                                 
   2.62 │       vfmadd231pd  %ymm11,%ymm1,%ymm6                                 
   6.87 │       vcvtdq2pd    %xmm0,%ymm1                                        
   4.28 │       vextracti128 $0x1,%ymm0,%xmm0                                   
   8.40 │       vcvtdq2pd    %xmm0,%ymm0                                        
   4.32 │       vfmadd231pd  %ymm1,%ymm10,%ymm8                                 
   4.22 │       vfmadd231pd  %ymm0,%ymm9,%ymm7                                  
        │     for (u=0; u < (ssize_t) kernel->width; u++, k--) {                
   4.17 │       cmp          %rdx,%r9                                           
        │     ↑ jne          700                   

So it seems we simply miss vectorization because we optimize for size.

runtime is
 217s with -Ofast -fprofile-use and train run
 170s with -Ofast 
 165s with -Ofast -fprofile-use and train run hacked to be refrate run

Reply via email to