https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64500

--- Comment #9 from ak at gcc dot gnu.org ---
I can test it later, but it would surprise me if it helps. The problem is not
the computation but the cache misses. When profiling it I see a lot of cache
misses on the memory load of the "cmp". So we likely need to do something about
the data structure.

Looking at some LBR data, the list walks just seem to be too long. Several of
the iterations exceeded the 32-entry limit of the Intel LBR. A 90+ cycle
latency must be due to multiple cache misses. I saw up to 340 cycles just for
the loop body.

E.g., here is an excerpt with cycle data (each "# PRED" line gives the cycles
for the block above it, with the running total in brackets):

      0000000001278705                        jnz 0x12786e0                    
      # PRED 74 cycles [74]
        00000000012786e0                        cmpw  $0x2, (%rbx)
        00000000012786e4                        jz 0x1278e20
        00000000012786ea                        movq  0x20(%rbx), %rbp
        00000000012786ee                        test %rbp, %rbp
        00000000012786f1                        jz 0x12786fe
        00000000012786f3                        cmpq  $0x0, 0x38(%rbp)
        00000000012786f8                        jnz 0x1278868
        00000000012786fe                        movq  0x10(%rbx), %rbx
        0000000001278702                        test %rbx, %rbx
        0000000001278705                        jnz 0x12786e0                  
        # PRED 78 cycles [152] 0.13 IPC
        00000000012786e0                        cmpw  $0x2, (%rbx)
        00000000012786e4                        jz 0x1278e20
        00000000012786ea                        movq  0x20(%rbx), %rbp
        00000000012786ee                        test %rbp, %rbp
        00000000012786f1                        jz 0x12786fe
        00000000012786f3                        cmpq  $0x0, 0x38(%rbp)
        00000000012786f8                        jnz 0x1278868
        00000000012786fe                        movq  0x10(%rbx), %rbx
        0000000001278702                        test %rbx, %rbx
        0000000001278705                        jnz 0x12786e0                  
        # PRED 356 cycles [508] 0.03 IPC
        00000000012786e0                        cmpw  $0x2, (%rbx)
        00000000012786e4                        jz 0x1278e20
        00000000012786ea                        movq  0x20(%rbx), %rbp
        00000000012786ee                        test %rbp, %rbp
        00000000012786f1                        jz 0x12786fe
        00000000012786f3                        cmpq  $0x0, 0x38(%rbp)
        00000000012786f8                        jnz 0x1278868
        00000000012786fe                        movq  0x10(%rbx), %rbx
        0000000001278702                        test %rbx, %rbx
        0000000001278705                        jnz 0x12786e0                  
        # PRED 24 cycles [532] 0.42 IPC
        00000000012786e0                        cmpw  $0x2, (%rbx)
        00000000012786e4                        jz 0x1278e20
        00000000012786ea                        movq  0x20(%rbx), %rbp
        00000000012786ee                        test %rbp, %rbp
        00000000012786f1                        jz 0x12786fe
        00000000012786f3                        cmpq  $0x0, 0x38(%rbp)
        00000000012786f8                        jnz 0x1278868
        00000000012786fe                        movq  0x10(%rbx), %rbx
        0000000001278702                        test %rbx, %rbx
        0000000001278705                        jnz 0x12786e0                  
        # PRED 94 cycles [626] 0.11 IPC
        00000000012786e0                        cmpw  $0x2, (%rbx)
        00000000012786e4                        jz 0x1278e20
        00000000012786ea                        movq  0x20(%rbx), %rbp
        00000000012786ee                        test %rbp, %rbp
        00000000012786f1                        jz 0x12786fe
        00000000012786f3                        cmpq  $0x0, 0x38(%rbp)
        00000000012786f8                        jnz 0x1278868
        00000000012786fe                        movq  0x10(%rbx), %rbx
        0000000001278702                        test %rbx, %rbx
        0000000001278705                        jnz 0x12786e0                  
        # PRED 70 cycles [696] 0.14 IPC
 ...

Reply via email to