https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71191
Bug ID: 71191
Summary: aarch64 and others:
__atomic_load;arithmetic;__atomic_compare_exchange
loops should be able to generate better code with
LL/SC-type constructs than a CAS loop
Product: gcc
Version: 6.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: dhowells at redhat dot com
Target Milestone: ---
In the kernel, we have various bits of code that boil down to:

int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
int new;

do {
        new = do_something_to(cur);
} while (!__atomic_compare_exchange_n(&v->counter,
                                      &cur, new,
                                      false,
                                      __ATOMIC_SEQ_CST,
                                      __ATOMIC_RELAXED));
and:

int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
int new;

do {
        if (cur == not_this_value)
                break;
        new = do_something_to(cur);
} while (!__atomic_compare_exchange_n(&v->counter,
                                      &cur, new,
                                      false,
                                      __ATOMIC_SEQ_CST,
                                      __ATOMIC_RELAXED));
For example:

static __always_inline int __atomic_add_unless(atomic_t *v,
                                               int addend, int unless)
{
        int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
        int new;

        do {
                if (__builtin_expect(cur == unless, 0))
                        break;
                new = cur + addend;
        } while (!__atomic_compare_exchange_n(&v->counter,
                                              &cur, new,
                                              false,
                                              __ATOMIC_SEQ_CST,
                                              __ATOMIC_RELAXED));
        return cur;
}

int test_atomic_add_unless(atomic_t *counter)
{
        return __atomic_add_unless(counter, 0x56, 0x23);
}
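For a standalone test case, atomic_t and __always_inline are the kernel's definitions; roughly speaking they amount to the following (noted here only so the example above is self-contained):

        typedef struct { int counter; } atomic_t;
        #define __always_inline inline __attribute__((__always_inline__))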
If the CPU has LL/SC constructs available, something like this is probably better implemented using those than a CMPXCHG loop, _provided_ the code between the __atomic_load and the __atomic_compare_exchange doesn't resolve to more than a few instructions.
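(For reference, the listings below were presumably produced by something like the following; the exact invocation isn't recorded in this report, so treat the flags as an assumption:

        aarch64-linux-gnu-gcc -O2 -march=armv8-a -S test.c
        aarch64-linux-gnu-gcc -O2 -march=armv8-a+lse -S test.c      # LSE/CASAL variant
)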
So, using armv8-a as an example, the test_atomic_add_unless() function above
compiles to:

test_atomic_add_unless:
        sub     sp, sp, #16     # unnecessary
        ldr     w1, [x0]        # __atomic_load_n()
        str     w1, [sp, 12]    # bug 70825
.L5:
        ldr     w1, [sp, 12]    # bug 70825
        cmp     w1, 35
        beq     .L4
        ldr     w3, [sp, 12]    # bug 70825
        add     w1, w1, 86
.L7:
        ldaxr   w2, [x0]        # }
        cmp     w2, w3          # } __atomic_compare_exchange()
        bne     .L8             # }
        stlxr   w4, w1, [x0]    # }
        cbnz    w4, .L7         # }
.L8:
        beq     .L4
        str     w2, [sp, 12]    # bug 70825
        b       .L5
.L4:
        ldr     w0, [sp, 12]    # bug 70825
        add     sp, sp, 16      # unnecessary
        ret
Removing the bits gcc adds because of bug 70825 and using registers throughout, this can be reduced to:

test_atomic_add_unless:
        ldr     w1, [x0]        # __atomic_load_n()
.L5:
        cmp     w1, 35
        beq     .L4
        add     w2, w1, 86
        mov     w3, w1
.L7:
        ldaxr   w1, [x0]        # }
        cmp     w1, w3          # } __atomic_compare_exchange()
        bne     .L5             # }
        stlxr   w4, w2, [x0]    # }
        cbnz    w4, .L7         # }
.L4:
        mov     w0, w1
        ret
However, the atomic load, the arithmetic and the compare exchange can be
condensed further by moving the arithmetic inside the LL/SC section:

test_atomic_add_unless:
.L7:
        ldaxr   w1, [x0]        # __atomic_load_n()
        cmp     w1, 35          # } if (cur == unless)
        beq     .L4             # }         break
        add     w2, w1, 86      # new = cur + addend
        stlxr   w4, w2, [x0]
        cbnz    w4, .L7
.L4:
        mov     w0, w1
        ret
When compiled for -march=armv8-a+lse, however, the code resolves to a
compare-exchange loop using the CASAL instruction:

test_atomic_add_unless:
        sub     sp, sp, #16     # unnecessary
        ldr     w1, [x0]        # __atomic_load_n()
        str     w1, [sp, 12]    # bug 70825
.L5:
        ldr     w1, [sp, 12]    # bug 70825
        cmp     w1, 35
        beq     .L4
        ldr     w3, [sp, 12]    # bug 70825
        add     w1, w1, 86
        mov     w2, w3
        casal   w2, w1, [x0]    # __atomic_compare_exchange_n()
        cmp     w2, w3
        beq     .L4
        str     w2, [sp, 12]    # bug 70825
        b       .L5
.L4:
        ldr     w0, [sp, 12]    # bug 70825
        add     sp, sp, 16      # unnecessary
        ret
but LDAXR/STLXR might actually be better.
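
For what it's worth, the condensed LDAXR/STLXR sequence shown earlier is roughly what has to be hand-coded with GCC extended inline asm today to get output of that shape. A minimal sketch, assuming the atomic_t definition from the example above; the function name and operand constraints are illustrative only, not taken from the kernel or from GCC:

static inline int ll_sc_add_unless(atomic_t *v, int addend, int unless)
{
        int cur, new, tmp;

        asm volatile(
        "1:     ldaxr   %w[cur], %[ctr]\n"              /* load-exclusive (acquire) */
        "       cmp     %w[cur], %w[unless]\n"
        "       b.eq    2f\n"                           /* cur == unless: bail out without storing */
        "       add     %w[new], %w[cur], %w[addend]\n"
        "       stlxr   %w[tmp], %w[new], %[ctr]\n"     /* store-exclusive (release) */
        "       cbnz    %w[tmp], 1b\n"                  /* retry if the exclusive reservation was lost */
        "2:"
        : [cur] "=&r" (cur), [new] "=&r" (new), [tmp] "=&r" (tmp),
          [ctr] "+Q" (v->counter)
        : [unless] "r" (unless), [addend] "r" (addend)
        : "cc", "memory");

        return cur;
}

The point of this report is that the compiler should be able to produce that sequence itself from the portable __atomic_load_n()/__atomic_compare_exchange_n() loop, without anyone having to resort to inline assembly.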