C11 __rte_ring_headtail_move_head_mt() uses output
parameter: 'uint32_t *old_head' directly within CAS operation.
In x86_64 that cause gcc to generate extra instructions to
store return value of CAS (eax) within 'old_head' memory location,
even when CAS was not successful and another attempt should be
performed. In some cases, even extra branch can be observed.
To be more specific the code like that is generated:
// start of 'do { } while();' loop
.L2
...
lock cmpxchgl %r8d, (%rdi)
jne .L17 //
.L1: // <---- successful completion of CAS, finish
movl %edx, %eax
ret
.L17: // <---- unsuccessful completion of CAS, repeat
movl %eax, (%r9)
jmp .L2
In constrast, x86 specific version that uses
__sync_bool_compare_and_swap() doesn't exibit such problem,
as __sync_bool_compare_and_swap() doesn't update the 'old_head'
with new value, and we have to re-read it explicitly on each iteration.
Overcome that problem by using local variable 'head' inside the loop,
and updaing '*old_head' value only at exit.
With such change gcc manages to avoid extra store(/branch).
Depends-on: series-38225 ("deprecate rte_atomicNN family")
Signed-off-by: Konstantin Ananyev <[email protected]>
---
lib/ring/rte_ring_c11_pvt.h | 19 +++++++++++--------
lib/ring/rte_ring_elem_pvt.h | 5 -----
2 files changed, 11 insertions(+), 13 deletions(-)
diff --git a/lib/ring/rte_ring_c11_pvt.h b/lib/ring/rte_ring_c11_pvt.h
index 3efe011f08..ee98155bea 100644
--- a/lib/ring/rte_ring_c11_pvt.h
+++ b/lib/ring/rte_ring_c11_pvt.h
@@ -52,6 +52,7 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
unsigned int n, enum rte_ring_queue_behavior behavior,
uint32_t *old_head, uint32_t *new_head, uint32_t *entries)
{
+ uint32_t head;
unsigned int max = n;
/*
@@ -61,7 +62,7 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail *d,
* d->head.
* If not, an unsafe partial order may ensue.
*/
- *old_head = rte_atomic_load_explicit(&d->head,
rte_memory_order_acquire);
+ head = rte_atomic_load_explicit(&d->head, rte_memory_order_acquire);
do {
/* Reset n to the initial burst count */
n = max;
@@ -76,10 +77,10 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail
*d,
/* The subtraction is done between two unsigned 32bits value
* (the result is always modulo 32 bits even if we have
- * *old_head > s->tail). So 'entries' is always between 0
+ * head > s->tail). So 'entries' is always between 0
* and capacity (which is < size).
*/
- *entries = capacity + stail - *old_head;
+ *entries = capacity + stail - head;
/* check that we have enough room in ring */
if (unlikely(n > *entries))
@@ -87,11 +88,11 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail
*d,
0 : *entries;
if (n == 0)
- return 0;
+ break;
- *new_head = *old_head + n;
+ *new_head = head + n;
- /* on failure, *old_head is updated */
+ /* on failure, head is updated */
/*
* R1/A2.
* R1: Establishes a synchronizing edge with A0 of a
@@ -99,11 +100,13 @@ __rte_ring_headtail_move_head_mt(struct rte_ring_headtail
*d,
* A2: Establishes a synchronizing edge with R1 of a
* different thread to observe same value for stail
* observed by that thread on CAS failure (to retry
- * with an updated *old_head).
+ * with an updated head).
*/
} while (unlikely(!rte_atomic_compare_exchange_strong_explicit(
- &d->head, old_head, *new_head,
+ &d->head, &head, *new_head,
rte_memory_order_release,
rte_memory_order_acquire)));
+
+ *old_head = head;
return n;
}
diff --git a/lib/ring/rte_ring_elem_pvt.h b/lib/ring/rte_ring_elem_pvt.h
index 9d1da12a92..51176b0405 100644
--- a/lib/ring/rte_ring_elem_pvt.h
+++ b/lib/ring/rte_ring_elem_pvt.h
@@ -396,12 +396,7 @@ __rte_ring_headtail_move_head_st(struct rte_ring_headtail
*d,
return n;
}
-/* There are two choices because GCC optimizer does poorly on
atomic_compare_exchange */
-#if defined(RTE_TOOLCHAIN_GCC) && defined(RTE_ARCH_X86)
-#include "rte_ring_x86_pvt.h"
-#else
#include "rte_ring_c11_pvt.h"
-#endif
/**
* @internal This function updates the producer head for enqueue
--
2.51.0