[gcc r15-6348] libstdc++: Adjust probabilities of hashmap loop conditions

2024-12-18 Thread Tamar Christina via Libstdc++-cvs
https://gcc.gnu.org/g:f8f686a12989a0dcf8ab0235641cf4a8dceae67c

commit r15-6348-gf8f686a12989a0dcf8ab0235641cf4a8dceae67c
Author: Tamar Christina 
Date:   Wed Dec 18 16:39:25 2024 +

libstdc++: Adjust probabilities of hashmap loop conditions

We are currently generating a loop with more comparisons than you'd
typically need, because the probabilities on the small-size loop are such
that it assumes the likely case is that an element is not found.

This again generates a pattern that's harder for branch predictors to
follow, but it also just generates more instructions for what one could say
is the typical case: that your hashtable contains the entry you are looking
for.

This patch adds a __builtin_expect in _M_find_before_node where, at the
moment, the loop is optimized for the case where we don't do any iterations.
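
The actual libstdc++ diff is not shown in this excerpt; as a rough
standalone sketch of the idea (invented names, not the real
_M_find_before_node code), the hint marks "reached the end of the bucket
chain without a match" as the unlikely exit:

struct Node { Node* next; int key; };

// Standalone sketch, not libstdc++ code: walk a bucket-style chain looking
// for `val`.  The chain is assumed non-empty, mirroring the fact that this
// path is only taken once the table is above a size threshold.
// __builtin_expect marks "fell off the end of the chain" as the rare
// outcome, so GCC lays the loop out for the found/keep-going path.
bool chain_contains (Node* head, int val)
{
  for (Node* p = head; ; p = p->next)
    {
      if (p->key == val)
        return true;                            // expected common outcome

      if (__builtin_expect (p->next == nullptr, 0))
        return false;                           // hinted as the rare outcome
    }
}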

A simple testcase is (compiled with -fno-split-paths to simulate the loop
in libstdc++):

#include 

bool foo (int **a, int n, int val, int *tkn)
{
  for (int i = 0; i < n; i++)
    {
      if (!a[i] || a[i]==tkn)
        return false;

      if (*a[i] == val)
        return true;
    }
}

which generates:

foo:
cmp w1, 0
ble .L1
add x1, x0, w1, uxtw 3
b   .L4
.L9:
ldr w4, [x4]
cmp w4, w2
beq .L6
cmp x0, x1
beq .L1
.L4:
ldr x4, [x0]
add x0, x0, 8
cmp x4, 0
ccmp x4, x3, 4, ne
bne .L9
mov w0, 0
.L1:
ret
.L6:
mov w0, 1
ret

i.e. BB rotation makes it generate an unconditional branch to a conditional
branch.  However this method is only called when the size is above a certain
threshold, and so it's likely that we have to do that first iteration.

Adding:

#include 

bool foo (int **a, int n, int val, int *tkn)
{
  for (int i = 0; i < n; i++)
    {
      if (__builtin_expect(!a[i] || a[i]==tkn, 0))
        return false;

      if (*a[i] == val)
        return true;
    }
}

to indicate that we will likely do one more iteration generates:

foo:
cmp w1, 0
ble .L1
add x1, x0, w1, uxtw 3
.L4:
ldr x4, [x0]
add x0, x0, 8
cmp x4, 0
ccmp x4, x3, 4, ne
beq .L5
ldr w4, [x4]
cmp w4, w2
beq .L6
cmp x0, x1
bne .L4
.L1:
ret
.L5:
mov w0, 0
ret
.L6:
mov w0, 1
ret

which results in ~0-10% extra improvement on top of the previous patch.

In table form:


+-------------+---------------+-------+--------------------+-------------------+-----------------+
| benchmark   | Type          | Size  | Inline vs baseline | final vs baseline | final vs inline |
+-------------+---------------+-------+--------------------+-------------------+-----------------+
| find many   | uint64_t      | 11253 | -15.67%            | -22.96%           | -8.65%          |
| find many   | uint64_t      | 11253 | -16.74%            | -23.37%           | -7.96%          |
| find single | uint64_t      | 345   | -5.88%             | -11.54%           | -6.02%          |
| find many   | string        | 11253 | -4.50%             | -9.56%            | -5.29%          |
| find single | uint64_t      | 345   | -4.38%             | -9.41%            | -5.26%          |
| find single | shared string | 11253 | -6.67%             | -11.00%           | -4.64%          |
| find single | shared string | 11253 | -4.63%             | -9.03%            | -4.61%          |
| find single | shared string | 345   | -10.41%            | -14.44%           | -4.50%          |
| find many   | string        | 11253 | -3.41%             | -7.51%            | -4.24%          |
| find many   | shared string | 11253 | -2.30%             | -5.72%            | -3.50%          |
| find many   | string        | 13    | 2.86%              | -0.30%            | -3.07%          |
| find single | string        | 11253 | 4.47%              | 1.34%             | -3.00%          |
| find many   | custom string | 11253 | 0.25%              | -2.75%            | -2.99%          |
| find single | uint64_t      | 345   | 2.99%              | 0.01%             | -2.90%          |
| find single | shared string | 345   | -11.53%            | -13.67%           | -2.41%          |
| find single | uint64_t      | 11253 | 0.49%              | -1.59%            | -2.07%          |
+-------------+---------------+-------+--------------------+-------------------+-----------------+

[gcc r15-6324] libstdc++: Add inline keyword to _M_locate

2024-12-18 Thread Tamar Christina via Libstdc++-cvs
https://gcc.gnu.org/g:18aff7644ad1e44dc146d36a2b7e397977aa47ac

commit r15-6324-g18aff7644ad1e44dc146d36a2b7e397977aa47ac
Author: Tamar Christina 
Date:   Wed Dec 18 09:02:46 2024 +

libstdc++: Add inline keyword to _M_locate

In GCC 12 there was a ~40% regression in the performance of hashmap->find.

This regression came about accidentally:

Before GCC 12 the find function was small enough that IPA would inline it
even though it wasn't marked inline.  In GCC 12 an optimization was added to
perform a linear search when the number of entries in the hashmap is small.

This increased the size of the function enough that IPA would no longer
inline it.
Inlining had two benefits:

1.  The return value is a reference, so it has to be returned and
    dereferenced even though the search loop may have already dereferenced
    it.
2.  The pattern is a hard one for branch predictors to track.  This causes
    a large number of branch misses if the value is immediately checked and
    branched on, i.e. if (a != m.end()), which is a common pattern.
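
For reference, the pattern mentioned in point 2 is the everyday lookup
idiom below (a generic sketch, not code taken from the benchmarks):

#include <string>
#include <unordered_map>

// Typical consumer of unordered_map::find: the iterator is immediately
// compared against end() and branched on.  When find() is not inlined,
// this branch is hard to predict and misses often on lookup-heavy
// workloads.
int lookup (const std::unordered_map<std::string, int>& m,
            const std::string& key)
{
  auto it = m.find (key);
  if (it != m.end ())          // the "a != m.end()" check from point 2
    return it->second;
  return -1;
}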

The patch fixes both these issues by adding the inline keyword to _M_locate
to allow the inliner to consider inlining again.

This and the other patches have been run through several benchmarks where
the size, the number of elements searched for, the type (reference vs
value), etc. were varied.

The change shows no statistical regression, but an average find improvement
of ~27% and a range of ~10-60% improvement.  A selection of the results:

+-------+------------+-------+----------+
| Group | Benchmark  | Size  | % Inline |
+-------+------------+-------+----------+
| Find  | unord

-auto
+inline auto
 _Hashtable<_Key, _Value, _Alloc, _ExtractKey, _Equal,
   _Hash, _RangeHash, _Unused, _RehashPolicy, _Traits>::
 _M_locate(const key_type& __k) const
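
The diff in the archive above is cut off; as a self-contained illustration
of the kind of change (invented names, not the libstdc++ ones), adding the
inline keyword to the out-of-class definition of a class-template member
looks like this:

template <typename Key>
struct Table
{
  const Key* locate (const Key& k) const;   // declaration only
  Key data[16];
};

// Marking the out-of-class definition `inline` mirrors what the patch does
// to _M_locate.  For a template it does not change how many definitions are
// allowed, but GCC's inliner then treats the function as explicitly inline,
// which in practice raises the size limits used when deciding whether to
// inline it at call sites.
template <typename Key>
inline const Key*
Table<Key>::locate (const Key& k) const
{
  for (const Key& e : data)
    if (e == k)
      return &e;
  return nullptr;
}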


[gcc r14-11200] libstdc++: backport inline keyword on std::find

2025-01-10 Thread Tamar Christina via Libstdc++-cvs
https://gcc.gnu.org/g:e4a9fb7448a687f4fd7e621942006c2820b803d6

commit r14-11200-ge4a9fb7448a687f4fd7e621942006c2820b803d6
Author: Tamar Christina 
Date:   Fri Jan 10 21:37:40 2025 +

libstdc++: backport inline keyword on std::find

This is a backport version of the same patch as
g:18aff7644ad1e44dc146d36a2b7e397977aa47ac

In GCC 12 there was a ~40% regression in the performance of hashmap->find.

This regression came about accidentally:

Before GCC 12 the find function was small enough that IPA would inline it
even though it wasn't marked inline.  In GCC 12 an optimization was added to
perform a linear search when the number of entries in the hashmap is small.

This increased the size of the function enough that IPA would no longer
inline it.
Inlining had two benefits:

1.  The return value is a reference, so it has to be returned and
    dereferenced even though the search loop may have already dereferenced
    it.
2.  The pattern is a hard one for branch predictors to track.  This causes
    a large number of branch misses if the value is immediately checked and
    branched on, i.e. if (a != m.end()), which is a common pattern.

The patch fixes both these issues by adding the inline keyword to _M_locate
to allow the inliner to consider inlining again.

This and the other patches have been run through several benchmarks where
the size, the number of elements searched for, the type (reference vs
value), etc. were varied.

The change shows no statistical regression, but an average find improvement
of ~27% and a range of ~10-60% improvement.

Thanks,
Tamar

libstdc++-v3/ChangeLog:

* include/bits/hashtable.h (find): Add inline keyword.

Diff:
---
 libstdc++-v3/include/bits/hashtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h
index 834288c747c2..f5f421d2fd32 100644
--- a/libstdc++-v3/include/bits/hashtable.h
+++ b/libstdc++-v3/include/bits/hashtable.h
@@ -1723,7 +1723,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typename _ExtractKey, typename _Equal,
   typename _Hash, typename _RangeHash, typename _Unused,
   typename _RehashPolicy, typename _Traits>
-auto
+auto inline
 _Hashtable<_Key, _Value, _Alloc, _ExtractKey, _Equal,
   _Hash, _RangeHash, _Unused, _RehashPolicy, _Traits>::
 find(const key_type& __k)
@@ -1746,7 +1746,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typename _ExtractKey, typename _Equal,
   typename _Hash, typename _RangeHash, typename _Unused,
   typename _RehashPolicy, typename _Traits>
-auto
+auto inline
 _Hashtable<_Key, _Value, _Alloc, _ExtractKey, _Equal,
   _Hash, _RangeHash, _Unused, _RehashPolicy, _Traits>::
 find(const key_type& __k) const


[gcc r13-9303] libstdc++: backport inline keyword on std::find

2025-01-10 Thread Tamar Christina via Libstdc++-cvs
https://gcc.gnu.org/g:f00e19a0491223d2782f9f863a4f3a31d509f76b

commit r13-9303-gf00e19a0491223d2782f9f863a4f3a31d509f76b
Author: Tamar Christina 
Date:   Fri Jan 10 21:37:40 2025 +

libstdc++: backport inline keyword on std::find

This is a backport version of the same patch as
g:18aff7644ad1e44dc146d36a2b7e397977aa47ac

In GCC 12 there was a ~40% regression in the performance of hashmap->find.

This regression came about accidentally:

Before GCC 12 the find function was small enough that IPA would inline it
even though it wasn't marked inline.  In GCC 12 an optimization was added to
perform a linear search when the number of entries in the hashmap is small.

This increased the size of the function enough that IPA would no longer
inline it.
Inlining had two benefits:

1.  The return value is a reference, so it has to be returned and
    dereferenced even though the search loop may have already dereferenced
    it.
2.  The pattern is a hard one for branch predictors to track.  This causes
    a large number of branch misses if the value is immediately checked and
    branched on, i.e. if (a != m.end()), which is a common pattern.

The patch fixes both these issues by adding the inline keyword to _M_locate
to allow the inliner to consider inlining again.

This and the other patches have been run through several benchmarks where
the size, the number of elements searched for, the type (reference vs
value), etc. were varied.

The change shows no statistical regression, but an average find improvement
of ~27% and a range of ~10-60% improvement.

Thanks,
Tamar

libstdc++-v3/ChangeLog:

* include/bits/hashtable.h (find): Add inline keyword.

(cherry picked from commit e4a9fb7448a687f4fd7e621942006c2820b803d6)

Diff:
---
 libstdc++-v3/include/bits/hashtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h
index 1b5d0a7f42f4..c9ae0ed2c013 100644
--- a/libstdc++-v3/include/bits/hashtable.h
+++ b/libstdc++-v3/include/bits/hashtable.h
@@ -1660,7 +1660,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typename _ExtractKey, typename _Equal,
   typename _Hash, typename _RangeHash, typename _Unused,
   typename _RehashPolicy, typename _Traits>
-auto
+auto inline
 _Hashtable<_Key, _Value, _Alloc, _ExtractKey, _Equal,
   _Hash, _RangeHash, _Unused, _RehashPolicy, _Traits>::
 find(const key_type& __k)
@@ -1683,7 +1683,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typename _ExtractKey, typename _Equal,
   typename _Hash, typename _RangeHash, typename _Unused,
   typename _RehashPolicy, typename _Traits>
-auto
+auto inline
 _Hashtable<_Key, _Value, _Alloc, _ExtractKey, _Equal,
   _Hash, _RangeHash, _Unused, _RehashPolicy, _Traits>::
 find(const key_type& __k) const