https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109690

--- Comment #7 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Thanks a lot!  There however still seems to be a problem with vectorization.

On zen4 I now get:
jh@ryzen4:~/gcc/build/gcc> ./xgcc -B ./ -O2 -march=native slp.c  ; perf stat
./a.out

 Performance counter stats for './a.out':

          1,835.21 msec task-clock:u                     #    1.000 CPUs
utilized             
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
                53      page-faults:u                    #   28.880 /sec        
    10,000,113,961      cycles:u                         #    5.449 GHz        
                (83.22%)
            31,284      stalled-cycles-frontend:u        #    0.00% frontend
cycles idle        (83.23%)
            64,771      stalled-cycles-backend:u         #    0.00% backend
cycles idle         (83.43%)
     9,000,118,863      instructions:u                   #    0.90  insn per
cycle            
                                                  #    0.00  stalled cycles per
insn     (83.44%)
     2,999,980,507      branches:u                       #    1.635 G/sec      
                (83.44%)
             1,445      branch-misses:u                  #    0.00% of all
branches             (83.25%)

       1.835610338 seconds time elapsed

       1.835628000 seconds user
       0.000000000 seconds sys


jh@ryzen4:~/gcc/build/gcc> ./xgcc -B ./ -O2 -march=native -fno-tree-vectorize
slp.c  ; perf stat ./a.out

 Performance counter stats for './a.out':

          1,107.63 msec task-clock:u                     #    1.000 CPUs
utilized             
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
                53      page-faults:u                    #   47.850 /sec        
     6,000,774,547      cycles:u                         #    5.418 GHz        
                (83.35%)
            32,208      stalled-cycles-frontend:u        #    0.00% frontend
cycles idle        (83.39%)
            57,126      stalled-cycles-backend:u         #    0.00% backend
cycles idle         (83.39%)
     7,999,763,446      instructions:u                   #    1.33  insn per
cycle            
                                                  #    0.00  stalled cycles per
insn     (83.39%)
     2,999,982,314      branches:u                       #    2.708 G/sec      
                (83.39%)
               911      branch-misses:u                  #    0.00% of all
branches             (83.09%)

       1.108032230 seconds time elapsed

       1.104079000 seconds user
       0.003985000 seconds sys


with -fno-tree-slp-vectorize I get:
loop:
.LFB0:
        .cfi_startproc
        sall    a(%rip)
        sall    a+4(%rip)
        sall    a+8(%rip)
        ret

Which still seems to be faster. It is the same if I do a[i]++:
jh@ryzen4:~/gcc/build/gcc> ./xgcc -B ./ -O2 -march=native slp2.c  ; perf stat
./a.out

 Performance counter stats for './a.out':

          1,832.63 msec task-clock:u                     #    1.000 CPUs
utilized             
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
                54      page-faults:u                    #   29.466 /sec        
    10,000,535,003      cycles:u                         #    5.457 GHz        
                (83.19%)
            36,576      stalled-cycles-frontend:u        #    0.00% frontend
cycles idle        (83.34%)
            75,320      stalled-cycles-backend:u         #    0.00% backend
cycles idle         (83.41%)
     9,999,890,371      instructions:u                   #    1.00  insn per
cycle            
                                                  #    0.00  stalled cycles per
insn     (83.41%)
     2,999,935,610      branches:u                       #    1.637 G/sec      
                (83.41%)
             1,447      branch-misses:u                  #    0.00% of all
branches             (83.23%)

       1.833046939 seconds time elapsed

       1.833062000 seconds user
       0.000000000 seconds sys


jh@ryzen4:~/gcc/build/gcc> ./xgcc -B ./ -O2 -march=native slp2.c
-fno-tree-vectorize ; perf stat ./a.out

 Performance counter stats for './a.out':

          1,110.15 msec task-clock:u                     #    1.000 CPUs
utilized             
                 0      context-switches:u               #    0.000 /sec        
                 0      cpu-migrations:u                 #    0.000 /sec        
                51      page-faults:u                    #   45.940 /sec        
     6,000,096,821      cycles:u                         #    5.405 GHz        
                (83.17%)
            28,459      stalled-cycles-frontend:u        #    0.00% frontend
cycles idle        (83.43%)
            48,165      stalled-cycles-backend:u         #    0.00% backend
cycles idle         (83.43%)
     7,999,665,012      instructions:u                   #    1.33  insn per
cycle            
                                                  #    0.00  stalled cycles per
insn     (83.43%)
     2,999,941,619      branches:u                       #    2.702 G/sec      
                (83.43%)
               719      branch-misses:u                  #    0.00% of all
branches             (83.12%)

       1.110557635 seconds time elapsed

       1.110575000 seconds user
       0.000000000 seconds sys


jh@ryzen4:~/gcc/build/gcc> cat slp2.c
/* Global array; only the first three elements are touched by loop().  */
int a[100];

/* Increment a[0], a[1] and a[2] by one.
   The fixed trip count of 3 is the shape under discussion in this report.
   NOTE(review): noipa is presumably here so the compiler cannot inline or
   otherwise optimize across the call from main() -- see the GCC function
   attribute documentation.  */
[[gnu::noipa]]
void loop()
{
          for (int i = 0; i < 3; i++)
                  a[i]++;
}
/* Benchmark driver: call loop() 1000000000 times, then exit with status 0.
   The result in a[] is never read back or printed.  */
int
main()
{
        for (int j = 0; j < 1000000000; j++)
          loop ();
        return 0;
}

Reply via email to