[Bug target/119298] [15 Regression] 538.imagick_r is faster when compiled with GCC 14.2 and -Ofast -flto -march=native than with master on Zen5 since r15-3441-g4292297a0f938f

hubicka at gcc dot gnu.org via Gcc-bugs Wed, 09 Apr 2025 07:53:44 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119298


--- Comment #15 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
I made a silly stand-alone test:

/* Buffer shared by the benchmark: foo () writes all four elements and
   main () reads them back after every call.  */
long test[4];
/* Store the four arguments into test[0] .. test[3], one scalar assignment
   per element.  The parameters are unsigned long while the array elements
   are long, so each store is an implicit conversion.

   The noipa attribute keeps the function opaque to interprocedural
   optimization (per the GCC attribute documentation this includes
   inlining), so the stores stay in this function and the loads stay in the
   caller.  These four adjacent stores are the ones the listings in this
   message show as scalar moves, two 128-bit vector stores, or one 256-bit
   vector store depending on the build.  */
__attribute__ ((noipa))
void
foo (unsigned long a, unsigned long b, unsigned long c, unsigned long d)
{
        test[0]=a;
        test[1]=b;
        test[2]=c;
        test[3]=d;
}

/* Benchmark driver: call foo () 1000000000 times and, after each call, add
   the four values it left in test[] to a running sum.  */
int
main()
{
        long s = 0;
        for (int i = 0; i < 1000000000; i++)
        {
                /* Write four consecutive values into test[] ...  */
                foo (i,i+1,i+2,i+3);
                /* ... and immediately read each element back, so every
                   iteration is a store in foo () followed by a load of the
                   same memory here.  */
                s+=test[0];
                s+=test[1];
                s+=test[2];
                s+=test[3];
        }
        /* The sum is returned (converted from long to int), presumably so
           that the loads above have a use and are not removed.  */
        return s;
}

And curiously enough, it strongly prefers a cost of 2 over 3.

jh@shroud:~/trunk/build/gcc> perf stat ./test-noslp 

 Performance counter stats for './test-noslp':

          1,211.17 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                55      page-faults:u                    #   45.411 /sec
     5,000,372,342      cycles:u                         #    4.129 GHz
     2,000,253,750      stalled-cycles-frontend:u        #   40.00% frontend cycles idle
    17,000,136,662      instructions:u                   #    3.40  insn per cycle
                                                  #    0.12  stalled cycles per insn
     3,000,030,827      branches:u                       #    2.477 G/sec
             2,592      branch-misses:u                  #    0.00% of all branches

       1.211767440 seconds time elapsed

       1.211832000 seconds user
       0.000000000 seconds sys


jh@shroud:~/trunk/build/gcc> perf stat ./test-cost3

 Performance counter stats for './test-cost3':

          7,266.90 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                55      page-faults:u                    #    7.569 /sec
    30,001,467,995      cycles:u                         #    4.129 GHz
         1,111,876      stalled-cycles-frontend:u        #    0.00% frontend cycles idle
    23,000,138,491      instructions:u                   #    0.77  insn per cycle
                                                  #    0.00  stalled cycles per insn
     3,000,032,652      branches:u                       #  412.835 M/sec
             4,455      branch-misses:u                  #    0.00% of all branches

       7.267898755 seconds time elapsed

       7.267379000 seconds user
       0.000000000 seconds sys


jh@shroud:~/trunk/build/gcc> perf stat ./test-cost2

 Performance counter stats for './test-cost2':

          1,089.54 msec task-clock:u                     #    1.000 CPUs utilized
                 0      context-switches:u               #    0.000 /sec
                 0      cpu-migrations:u                 #    0.000 /sec
                55      page-faults:u                    #   50.480 /sec
     4,501,104,318      cycles:u                         #    4.131 GHz
         5,495,394      stalled-cycles-frontend:u        #    0.12% frontend cycles idle
    24,000,136,630      instructions:u                   #    5.33  insn per cycle
                                                  #    0.00  stalled cycles per insn
     3,000,030,793      branches:u                       #    2.753 G/sec
             2,492      branch-misses:u                  #    0.00% of all branches

       1.090067946 seconds time elapsed

       1.090267000 seconds user
       0.000000000 seconds sys


The cost2 variant does:

00000000004011c0 <_Z3foommmm>:
  4011c0:       c4 e1 f9 6e d2          vmovq  %rdx,%xmm2
  4011c5:       c4 e1 f9 6e df          vmovq  %rdi,%xmm3
  4011ca:       c4 e3 e9 22 c9 01       vpinsrq $0x1,%rcx,%xmm2,%xmm1
  4011d0:       c4 e3 e1 22 c6 01       vpinsrq $0x1,%rsi,%xmm3,%xmm0
  4011d6:       62 f3 fd 28 38 c1 01    vinserti64x2 $0x1,%xmm1,%ymm0,%ymm0
  4011dd:       c5 fd 7f 05 5b 2e 00    vmovdqa %ymm0,0x2e5b(%rip)        # 404040 <test>
  4011e4:       00 
  4011e5:       c5 f8 77                vzeroupper
  4011e8:       c3                      ret
....
  401059:       c5 fd 6f 0d df 2f 00    vmovdqa 0x2fdf(%rip),%ymm1        # 404040 <test>
  401060:       00 
  401061:       62 f3 fd 28 39 c8 01    vextracti64x2 $0x1,%ymm1,%xmm0
  401068:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
  40106c:       c5 f1 73 d8 08          vpsrldq $0x8,%xmm0,%xmm1
  401071:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
  401075:       c4 e1 f9 7e c0          vmovq  %xmm0,%rax
  40107a:       49 01 c4                add    %rax,%r12


while the cost3 variant does:
00000000004011c0 <_Z3foommmm>:
  4011c0:       c4 e1 f9 6e d7          vmovq  %rdi,%xmm2
  4011c5:       c4 e1 f9 6e da          vmovq  %rdx,%xmm3
  4011ca:       c4 e3 e9 22 ce 01       vpinsrq $0x1,%rsi,%xmm2,%xmm1
  4011d0:       c4 e3 e1 22 c1 01       vpinsrq $0x1,%rcx,%xmm3,%xmm0
  4011d6:       c5 f9 7f 0d 62 2e 00    vmovdqa %xmm1,0x2e62(%rip)        # 404040 <test>
  4011dd:       00 
  4011de:       c5 f9 7f 05 6a 2e 00    vmovdqa %xmm0,0x2e6a(%rip)        # 404050 <test+0x10>
  4011e5:       00 
  4011e6:       c3                      ret
....
  401059:       c5 fd 6f 0d df 2f 00    vmovdqa 0x2fdf(%rip),%ymm1        # 404040 <test>
  401060:       00 
  401061:       62 f3 fd 28 39 c8 01    vextracti64x2 $0x1,%ymm1,%xmm0
  401068:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
  40106c:       c5 f1 73 d8 08          vpsrldq $0x8,%xmm0,%xmm1
  401071:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
  401075:       c4 e1 f9 7e c0          vmovq  %xmm0,%rax
  40107a:       49 01 c4                add    %rax,%r12


The noslp variant does:
00000000004011a0 <_Z3foommmm>:
  4011a0:       48 89 3d 99 2e 00 00    mov    %rdi,0x2e99(%rip)        # 404040 <test>
  4011a7:       48 89 35 9a 2e 00 00    mov    %rsi,0x2e9a(%rip)        # 404048 <test+0x8>
  4011ae:       48 89 15 9b 2e 00 00    mov    %rdx,0x2e9b(%rip)        # 404050 <test+0x10>
  4011b5:       48 89 0d 9c 2e 00 00    mov    %rcx,0x2e9c(%rip)        # 404058 <test+0x18>
  4011bc:       c3                      ret
....
  401046:       48 03 1d f3 2f 00 00    add    0x2ff3(%rip),%rbx        # 404040 <test>
  40104d:       48 03 1d f4 2f 00 00    add    0x2ff4(%rip),%rbx        # 404048 <test+0x8>
  401054:       48 03 1d f5 2f 00 00    add    0x2ff5(%rip),%rbx        # 404050 <test+0x10>
  40105b:       48 03 1d f6 2f 00 00    add    0x2ff6(%rip),%rbx        # 404058 <test+0x18>

[Bug target/119298] [15 Regression] 538.imagick_r is faster when compiled with GCC 14.2 and -Ofast -flto -march=native than with master on Zen5 since r15-3441-g4292297a0f938f

Reply via email to