https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84481

--- Comment #2 from Martin Jambor <jamborm at gcc dot gnu.org> ---
Regarding the generic tuning issue, the difference comes down to the
order of the three instructions starting at offset 46 in the hottest loop
below (left is fast, right is slow; each is shown with its perf samples):

    38082423 |30:   sub    %rsi,%rcx             37881074 |30:   sub    %rsi,%rcx
    33361536 |      add    $0x1,%rax             29965960 |      add    $0x1,%rax
    14727831 |      mov    %rcx,(%rdx)           11839813 |      mov    %rcx,(%rdx)
   306224188 |      mov    0x10(%rdx),%rcx      280934119 |      mov    0x10(%rdx),%rcx
     7929159 |      test   %rcx,%rcx              3987929 |      test   %rcx,%rcx
    11735894 |      je     69                     5855925 |      je     69
             |43:   mov    %rcx,%rdx                      |43:   mov    %rcx,%rdx
   239584355 |46:   cmpl   $0x1,0x8(%rdx)       225344308 |46:   mov    0x18(%rdx),%rcx
 10777052578 |      mov    0x18(%rdx),%rcx    21488318830 |      mov    0x30(%rdx),%rsi
  4358414249 |      mov    0x30(%rdx),%rsi     6773073327 |      cmpl   $0x1,0x8(%rdx)
  4227512903 |      mov    (%rcx),%rcx         1386678856 |      mov    (%rcx),%rcx
  6128900849 |      mov    (%rsi),%rsi         6005737871 |      mov    (%rsi),%rsi
 220097857758|      jne    30                 263974962392|      jne    30
    74107789 |      add    %rsi,%rcx             47610508 |      add    %rsi,%rcx
    29107594 |      mov    %rcx,(%rdx)           31975201 |      mov    %rcx,(%rdx)
    28866535 |      mov    0x10(%rdx),%rcx       31974627 |      mov    0x10(%rdx),%rcx
     2996253 |      test   %rcx,%rcx              6035544 |      test   %rcx,%rcx
    37486332 |      jne    43                    24769958 |      jne    43

Reply via email to