https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71191
            Bug ID: 71191
           Summary: aarch64 and others:
                    __atomic_load;arithmetic;__atomic_compare_exchange
                    loops should be able to generate better code with
                    LL/SC-type constructs than a CAS loop
           Product: gcc
           Version: 6.1.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dhowells at redhat dot com
  Target Milestone: ---

In the kernel, we have various bits of code that boil down to:

        int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
        int new;

        do {
                new = do_something_to(cur);
        } while (!__atomic_compare_exchange_n(&v->counter, &cur, new, false,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

and:

        int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
        int new;

        do {
                if (cur == not_this_value)
                        break;
                new = do_something_to(cur);
        } while (!__atomic_compare_exchange_n(&v->counter, &cur, new, false,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));

For example:

        static __always_inline int __atomic_add_unless(atomic_t *v,
                                                       int addend, int unless)
        {
                int cur = __atomic_load_n(&v->counter, __ATOMIC_RELAXED);
                int new;

                do {
                        if (__builtin_expect(cur == unless, 0))
                                break;
                        new = cur + addend;
                } while (!__atomic_compare_exchange_n(&v->counter, &cur, new,
                                                      false, __ATOMIC_SEQ_CST,
                                                      __ATOMIC_RELAXED));
                return cur;
        }

        int test_atomic_add_unless(atomic_t *counter)
        {
                return __atomic_add_unless(counter, 0x56, 0x23);
        }

If the CPU has LL/SC constructs available, something like this is probably
better implemented using those than a CMPXCHG loop - _provided_ the code
between the __atomic_load and the __atomic_compare_exchange doesn't resolve
to more than a few instructions.

So, using armv8-a as an example, the test_atomic_add_unless() function above
compiles to:

test_atomic_add_unless:
        sub     sp, sp, #16             # unnecessary
        ldr     w1, [x0]                # __atomic_load_n()
        str     w1, [sp, 12]            # bug 70825
.L5:
        ldr     w1, [sp, 12]            # bug 70825
        cmp     w1, 35
        beq     .L4
        ldr     w3, [sp, 12]            # bug 70825
        add     w1, w1, 86
.L7:
        ldaxr   w2, [x0]                # }
        cmp     w2, w3                  # } __atomic_compare_exchange()
        bne     .L8                     # }
        stlxr   w4, w1, [x0]            # }
        cbnz    w4, .L7                 # }
.L8:
        beq     .L4
        str     w2, [sp, 12]            # bug 70825
        b       .L5
.L4:
        ldr     w0, [sp, 12]            # bug 70825
        add     sp, sp, 16              # unnecessary
        ret

Removing the stack accesses introduced by bug 70825 and keeping the value in
registers throughout, this can be reduced to:

test_atomic_add_unless:
        ldr     w1, [x0]                # __atomic_load_n()
.L5:
        cmp     w1, 35
        beq     .L4
        add     w2, w1, 86
        mov     w3, w1
.L7:
        ldaxr   w1, [x0]                # }
        cmp     w1, w3                  # } __atomic_compare_exchange()
        bne     .L5                     # }
        stlxr   w4, w2, [x0]            # }
        cbnz    w4, .L7                 # }
.L4:
        mov     w0, w1
        ret

However, the atomic load, the arithmetic and the compare-exchange can be
condensed further by moving the arithmetic inside the LL/SC section:

test_atomic_add_unless:
.L7:
        ldaxr   w1, [x0]                # __atomic_load_n()
        cmp     w1, 35                  # } if (cur == unless)
        beq     .L4                     # }         break
        add     w2, w1, 86              # new = cur + addend
        stlxr   w4, w2, [x0]
        cbnz    w4, .L7
.L4:
        mov     w0, w1
        ret

When compiled for -march=armv8-a+lse, however, the code resolves to a
compare-exchange loop using the CASAL instruction:

test_atomic_add_unless:
        sub     sp, sp, #16             # unnecessary
        ldr     w1, [x0]                # __atomic_load_n()
        str     w1, [sp, 12]            # bug 70825
.L5:
        ldr     w1, [sp, 12]            # bug 70825
        cmp     w1, 35
        beq     .L4
        ldr     w3, [sp, 12]            # bug 70825
        add     w1, w1, 86
        mov     w2, w3
        casal   w2, w1, [x0]            # __atomic_compare_exchange_n()
        cmp     w2, w3
        beq     .L4
        str     w2, [sp, 12]            # bug 70825
        b       .L5
.L4:
        ldr     w0, [sp, 12]            # bug 70825
        add     sp, sp, 16              # unnecessary
        ret

but LDAXR/STLXR might actually be better.
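
For reference, here is a rough sketch of how the desired LDAXR/STLXR form can
be written by hand today with GCC extended inline asm.  The function name
ll_sc_add_unless, the operand names and the constraint choices below are
illustrative assumptions, not kernel code and not something GCC emits; the
point is only that the "unless" test and the addition sit between the
exclusive load and the exclusive store, as in the condensed listing above:

        static inline int ll_sc_add_unless(int *counter, int addend, int unless)
        {
                int cur, newval;
                unsigned int fail;

                __asm__ volatile(
                "1:     ldaxr   %w[cur], %[ctr]\n"      /* exclusive load-acquire */
                "       cmp     %w[cur], %w[unless]\n"  /* if (cur == unless)     */
                "       b.eq    2f\n"                   /*         break          */
                "       add     %w[new], %w[cur], %w[addend]\n"
                "       stlxr   %w[fail], %w[new], %[ctr]\n"  /* exclusive store-release */
                "       cbnz    %w[fail], 1b\n"         /* retry if the store-exclusive failed */
                "2:"
                : [cur] "=&r" (cur), [new] "=&r" (newval), [fail] "=&r" (fail),
                  [ctr] "+Q" (*counter)
                : [addend] "r" (addend), [unless] "r" (unless)
                : "cc", "memory");

                return cur;
        }

The "Q" constraint keeps the address in a single base register with no offset,
which is what the exclusive load/store instructions require.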