[Bug target/119298] [15 Regression] 538.imagick_r is faster when compiled with GCC 14.2 and -Ofast -flto -march=native than with master on Zen5 since r15-3441-g4292297a0f938f

rguenth at gcc dot gnu.org via Gcc-bugs Wed, 09 Apr 2025 23:12:39 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119298


--- Comment #16 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jan Hubicka from comment #15)
> I made a silly stand-alone test:
> 
> long test[4];
> __attribute__ ((noipa))
> void
> foo (unsigned long a, unsigned long b, unsigned long c, unsigned long d)
> {
>         test[0]=a;
>         test[1]=b;
>         test[2]=c;
>         test[3]=d;
> }
> 
> int
> main()
> {
>         long s = 0;
>         for (int i = 0; i < 1000000000; i++)
>         {
>                 foo (i,i+1,i+2,i+3);
>                 s+=test[0];
>                 s+=test[1];
>                 s+=test[2];
>                 s+=test[3];
>         }
>         return s;
> }
> 
> And curiously enough, it strongly prefers a cost of 2 over 3.
> 
> jh@shroud:~/trunk/build/gcc> perf stat ./test-noslp 
> 
>  Performance counter stats for './test-noslp':
> 
>           1,211.17 msec task-clock:u                     #    1.000 CPUs
> utilized             
>                  0      context-switches:u               #    0.000 /sec    
> 
>                  0      cpu-migrations:u                 #    0.000 /sec    
> 
>                 55      page-faults:u                    #   45.411 /sec    
> 
>      5,000,372,342      cycles:u                         #    4.129 GHz     
> 
>      2,000,253,750      stalled-cycles-frontend:u        #   40.00% frontend
> cycles idle      
>     17,000,136,662      instructions:u                   #    3.40  insn per
> cycle            
>                                                   #    0.12  stalled cycles
> per insn   
>      3,000,030,827      branches:u                       #    2.477 G/sec   
> 
>              2,592      branch-misses:u                  #    0.00% of all
> branches           
> 
>        1.211767440 seconds time elapsed
> 
>        1.211832000 seconds user
>        0.000000000 seconds sys
> 
> 
> jh@shroud:~/trunk/build/gcc> perf stat ./test-cost3
> 
>  Performance counter stats for './test-cost3':
> 
>           7,266.90 msec task-clock:u                     #    1.000 CPUs
> utilized             
>                  0      context-switches:u               #    0.000 /sec    
> 
>                  0      cpu-migrations:u                 #    0.000 /sec    
> 
>                 55      page-faults:u                    #    7.569 /sec    
> 
>     30,001,467,995      cycles:u                         #    4.129 GHz     
> 
>          1,111,876      stalled-cycles-frontend:u        #    0.00% frontend
> cycles idle      
>     23,000,138,491      instructions:u                   #    0.77  insn per
> cycle            
>                                                   #    0.00  stalled cycles
> per insn   
>      3,000,032,652      branches:u                       #  412.835 M/sec   
> 
>              4,455      branch-misses:u                  #    0.00% of all
> branches           
> 
>        7.267898755 seconds time elapsed
> 
>        7.267379000 seconds user
>        0.000000000 seconds sys
> 
> 
> jh@shroud:~/trunk/build/gcc> perf stat ./test-cost2
> 
>  Performance counter stats for './test-cost2':
> 
>           1,089.54 msec task-clock:u                     #    1.000 CPUs
> utilized             
>                  0      context-switches:u               #    0.000 /sec    
> 
>                  0      cpu-migrations:u                 #    0.000 /sec    
> 
>                 55      page-faults:u                    #   50.480 /sec    
> 
>      4,501,104,318      cycles:u                         #    4.131 GHz     
> 
>          5,495,394      stalled-cycles-frontend:u        #    0.12% frontend
> cycles idle      
>     24,000,136,630      instructions:u                   #    5.33  insn per
> cycle            
>                                                   #    0.00  stalled cycles
> per insn   
>      3,000,030,793      branches:u                       #    2.753 G/sec   
> 
>              2,492      branch-misses:u                  #    0.00% of all
> branches           
> 
>        1.090067946 seconds time elapsed
> 
>        1.090267000 seconds user
>        0.000000000 seconds sys
> 
> 
> Cost2 variant does:
> 
> 00000000004011c0 <_Z3foommmm>:
>   4011c0:       c4 e1 f9 6e d2          vmovq  %rdx,%xmm2
>   4011c5:       c4 e1 f9 6e df          vmovq  %rdi,%xmm3
>   4011ca:       c4 e3 e9 22 c9 01       vpinsrq $0x1,%rcx,%xmm2,%xmm1
>   4011d0:       c4 e3 e1 22 c6 01       vpinsrq $0x1,%rsi,%xmm3,%xmm0
>   4011d6:       62 f3 fd 28 38 c1 01    vinserti64x2 $0x1,%xmm1,%ymm0,%ymm0
>   4011dd:       c5 fd 7f 05 5b 2e 00    vmovdqa %ymm0,0x2e5b(%rip)        #
> 404040 <test>
>   4011e4:       00 
>   4011e5:       c5 f8 77                vzeroupper
>   4011e8:       c3                      ret
> ....
>   401059:       c5 fd 6f 0d df 2f 00    vmovdqa 0x2fdf(%rip),%ymm1        #
> 404040 <test>

That will forward nicely: the 32-byte load reads exactly what the single 32-byte store to test wrote.

>   401060:       00 
>   401061:       62 f3 fd 28 39 c8 01    vextracti64x2 $0x1,%ymm1,%xmm0
>   401068:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
>   40106c:       c5 f1 73 d8 08          vpsrldq $0x8,%xmm0,%xmm1
>   401071:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
>   401075:       c4 e1 f9 7e c0          vmovq  %xmm0,%rax
>   40107a:       49 01 c4                add    %rax,%r12
> 
> 
> while cost3 variant does:
> 00000000004011c0 <_Z3foommmm>:
>   4011c0:       c4 e1 f9 6e d7          vmovq  %rdi,%xmm2
>   4011c5:       c4 e1 f9 6e da          vmovq  %rdx,%xmm3
>   4011ca:       c4 e3 e9 22 ce 01       vpinsrq $0x1,%rsi,%xmm2,%xmm1
>   4011d0:       c4 e3 e1 22 c1 01       vpinsrq $0x1,%rcx,%xmm3,%xmm0
>   4011d6:       c5 f9 7f 0d 62 2e 00    vmovdqa %xmm1,0x2e62(%rip)        #
> 404040 <test>
>   4011dd:       00 
>   4011de:       c5 f9 7f 05 6a 2e 00    vmovdqa %xmm0,0x2e6a(%rip)        #
> 404050 <test+0x10>
>   4011e5:       00 
>   4011e6:       c3                      ret
> ....
>   401059:       c5 fd 6f 0d df 2f 00    vmovdqa 0x2fdf(%rip),%ymm1        #
> 404040 <test>

This will fail to forward, because the 32-byte load spans the two separate 16-byte stores above; thus a huge penalty.

>   401060:       00 
>   401061:       62 f3 fd 28 39 c8 01    vextracti64x2 $0x1,%ymm1,%xmm0
>   401068:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
>   40106c:       c5 f1 73 d8 08          vpsrldq $0x8,%xmm0,%xmm1
>   401071:       c5 f9 d4 c1             vpaddq %xmm1,%xmm0,%xmm0
>   401075:       c4 e1 f9 7e c0          vmovq  %xmm0,%rax
>   40107a:       49 01 c4                add    %rax,%r12
> 
> 
> The noslp variant does:
> 00000000004011a0 <_Z3foommmm>:
>   4011a0:       48 89 3d 99 2e 00 00    mov    %rdi,0x2e99(%rip)        #
> 404040 <test>
>   4011a7:       48 89 35 9a 2e 00 00    mov    %rsi,0x2e9a(%rip)        #
> 404048 <test+0x8>
>   4011ae:       48 89 15 9b 2e 00 00    mov    %rdx,0x2e9b(%rip)        #
> 404050 <test+0x10>
>   4011b5:       48 89 0d 9c 2e 00 00    mov    %rcx,0x2e9c(%rip)        #
> 404058 <test+0x18>
>   4011bc:       c3                      ret
> ....
>   401046:       48 03 1d f3 2f 00 00    add    0x2ff3(%rip),%rbx        #
> 404040 <test>
>   40104d:       48 03 1d f4 2f 00 00    add    0x2ff4(%rip),%rbx        #
> 404048 <test+0x8>
>   401054:       48 03 1d f5 2f 00 00    add    0x2ff5(%rip),%rbx        #
> 404050 <test+0x10>
>   40105b:       48 03 1d f6 2f 00 00    add    0x2ff6(%rip),%rbx        #
> 404058 <test+0x18>

[Bug target/119298] [15 Regression] 538.imagick_r is faster when compiled with GCC 14.2 and -Ofast -flto -march=native than with master on Zen5 since r15-3441-g4292297a0f938f

Reply via email to