https://gcc.gnu.org/g:f8f686a12989a0dcf8ab0235641cf4a8dceae67c

commit r15-6348-gf8f686a12989a0dcf8ab0235641cf4a8dceae67c
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Wed Dec 18 16:39:25 2024 +0000

    libstdc++: Adjust probabilities of hashmap loop conditions
    
    We are currently generating a loop which has more comparisons than you'd
    typically need as the probabilities on the small size loop are such that it
    assumes the likely case is that an element is not found.
    
    This again generates a pattern that's harder for branch predictors to follow,
    but also just generates more instructions for what one could say is the
    typical case: that your hashtable contains the entry you are looking for.
    
    This patch adds a __builtin_expect in _M_find_before_node where at the moment
    the loop is optimized for the case where we don't do any iterations.
    
    A simple testcase is (compiled with -fno-split-paths to simulate the loop
    in libstdc++):
    
    #include <stdbool.h>
    
    bool foo (int **a, int n, int val, int *tkn)
    {
        for (int i = 0; i < n; i++)
        {
            if (!a[i] || a[i]==tkn)
              return false;
    
            if (*a[i] == val)
              return true;
        }
    }
    
    which generates:
    
    foo:
            cmp     w1, 0
            ble     .L1
            add     x1, x0, w1, uxtw 3
            b       .L4
    .L9:
            ldr     w4, [x4]
            cmp     w4, w2
            beq     .L6
            cmp     x0, x1
            beq     .L1
    .L4:
            ldr     x4, [x0]
            add     x0, x0, 8
            cmp     x4, 0
            ccmp    x4, x3, 4, ne
            bne     .L9
            mov     w0, 0
    .L1:
            ret
    .L6:
            mov     w0, 1
            ret
    
    i.e. BB rotation makes it generate an unconditional branch to a conditional
    branch. However, this method is only called when the size is above a certain
    threshold, and so it's likely that we have to do that first iteration.
    
    Adding:
    
    #include <stdbool.h>
    
    bool foo (int **a, int n, int val, int *tkn)
    {
        for (int i = 0; i < n; i++)
        {
            if (__builtin_expect(!a[i] || a[i]==tkn, 0))
              return false;
    
            if (*a[i] == val)
              return true;
        }
    }
    
    to indicate that we will likely do one more iteration generates:
    
    foo:
            cmp     w1, 0
            ble     .L1
            add     x1, x0, w1, uxtw 3
    .L4:
            ldr     x4, [x0]
            add     x0, x0, 8
            cmp     x4, 0
            ccmp    x4, x3, 4, ne
            beq     .L5
            ldr     w4, [x4]
            cmp     w4, w2
            beq     .L6
            cmp     x0, x1
            bne     .L4
    .L1:
            ret
    .L5:
            mov     w0, 0
            ret
    .L6:
            mov     w0, 1
            ret
    
    which results in ~0-10% extra on top of the previous patch.
    
    In table form:
    
    
    +-------------+---------------+-------+--------------------+-------------------+-----------------+
    | benchmark   | Type          | Size  | Inline vs baseline | final vs baseline | final vs inline |
    +-------------+---------------+-------+--------------------+-------------------+-----------------+
    | find many   | uint64_t      | 11253 | -15.67%            | -22.96%           | -8.65%          |
    | find many   | uint64_t      | 11253 | -16.74%            | -23.37%           | -7.96%          |
    | find single | uint64_t      | 345   | -5.88%             | -11.54%           | -6.02%          |
    | find many   | string        | 11253 | -4.50%             | -9.56%            | -5.29%          |
    | find single | uint64_t      | 345   | -4.38%             | -9.41%            | -5.26%          |
    | find single | shared string | 11253 | -6.67%             | -11.00%           | -4.64%          |
    | find single | shared string | 11253 | -4.63%             | -9.03%            | -4.61%          |
    | find single | shared string | 345   | -10.41%            | -14.44%           | -4.50%          |
    | find many   | string        | 11253 | -3.41%             | -7.51%            | -4.24%          |
    | find many   | shared string | 11253 | -2.30%             | -5.72%            | -3.50%          |
    | find many   | string        | 13    | 2.86%              | -0.30%            | -3.07%          |
    | find single | string        | 11253 | 4.47%              | 1.34%             | -3.00%          |
    | find many   | custom string | 11253 | 0.25%              | -2.75%            | -2.99%          |
    | find single | uint64_t      | 345   | 2.99%              | 0.01%             | -2.90%          |
    | find single | shared string | 345   | -11.53%            | -13.67%           | -2.41%          |
    | find single | uint64_t      | 11253 | 0.49%              | -1.59%            | -2.07%          |
    +-------------+---------------+-------+--------------------+-------------------+-----------------+
    
    libstdc++-v3/ChangeLog:
    
            * include/bits/hashtable.h
            (_M_find_before_node): Make it likely that the map has at least one
            entry and so we do at least one iteration.

Diff:
---
 libstdc++-v3/include/bits/hashtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h
index cd60fb58c758..5792cc5e3922 100644
--- a/libstdc++-v3/include/bits/hashtable.h
+++ b/libstdc++-v3/include/bits/hashtable.h
@@ -2177,7 +2177,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          if (this->_M_equals(__k, __code, *__p))
            return __prev_p;
 
-         if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt)
+         if (__builtin_expect (!__p->_M_nxt || 
_M_bucket_index(*__p->_M_next()) != __bkt, 0))
            break;
          __prev_p = __p;
        }
@@ -2207,7 +2207,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
            if (this->_M_equals_tr(__k, __code, *__p))
              return __prev_p;
 
-           if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt)
+           if (__builtin_expect (!__p->_M_nxt || 
_M_bucket_index(*__p->_M_next()) != __bkt, 0))
              break;
            __prev_p = __p;
          }

Reply via email to