At present, `atomic_16.S' groups the different implementations of each function together in the file. For example, the LSE128 implementation of `exchange_16' immediately follows its core implementation, as does the LSE128 implementation of `fetch_or_16'.
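As a rough schematic (illustrative only, not the actual file contents), the
current interleaving looks like:

    ENTRY      (exchange_16)
    #if HAVE_FEAT_LSE128
    ENTRY_FEAT (exchange_16, LSE128)
    #endif
    ENTRY      (fetch_or_16)
    #if HAVE_FEAT_LSE128
    ENTRY_FEAT (fetch_or_16, LSE128)
    #endif
    ...

whereas after the reorganization described below it becomes:

    ENTRY      (exchange_16)
    ENTRY      (fetch_or_16)
    ...                                  /* core implementations */
    #if HAVE_IFUNC
    ENTRY_FEAT (load_16, LSE2)
    ...                                  /* LSE/LSE2 implementations */
    #if HAVE_FEAT_LSE128
    ENTRY_FEAT (exchange_16, LSE128)
    ENTRY_FEAT (fetch_or_16, LSE128)
    ...                                  /* LSE128 implementations */
    #endif /* HAVE_FEAT_LSE128 */
    #endif /* HAVE_IFUNC */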
Such architectural extension-dependent implementations are dependent both on ifunc and assembler support. They may therefore conceivably be guarded by 2 preprocessor macros, e.g. `#if HAVE_IFUNC' and `#if HAVE_FEAT_LSE128'. Having to apply these guards on a per-function basis adds unnecessary clutter to the file and makes its maintenance more error-prone. We therefore reorganize the layout of the file in such a way that all core implementations needing no `#ifdef's are placed first, followed by all ifunc-dependent implementations, which can all be guarded by a single `#if HAVE_IFUNC'. Within the guard, these are then subdivided and organized according to architectural extension requirements such that in the case of LSE128-specific functions, for example, they can all be guarded by a single `#if HAVE_FEAT_LSE128', greatly reducing the overall number of required `#ifdef' macros. libatomic/ChangeLog: * config/linux/aarch64/atomic_16.S: reshuffle functions. --- libatomic/config/linux/aarch64/atomic_16.S | 583 ++++++++++----------- 1 file changed, 288 insertions(+), 295 deletions(-) diff --git a/libatomic/config/linux/aarch64/atomic_16.S b/libatomic/config/linux/aarch64/atomic_16.S index 16ff03057ab..27363f82b75 100644 --- a/libatomic/config/linux/aarch64/atomic_16.S +++ b/libatomic/config/linux/aarch64/atomic_16.S @@ -40,15 +40,12 @@ #include "auto-config.h" -#if !HAVE_IFUNC -# undef HAVE_FEAT_LSE128 -# define HAVE_FEAT_LSE128 0 -#endif - -#define HAVE_FEAT_LSE2 HAVE_IFUNC - -#if HAVE_FEAT_LSE128 +#if HAVE_IFUNC +# if HAVE_FEAT_LSE128 .arch armv9-a+lse128 +# else + .arch armv8-a+lse +# endif #else .arch armv8-a+lse #endif @@ -124,6 +121,8 @@ NAME: \ #define ACQ_REL 4 #define SEQ_CST 5 +/* Core atomic operation implementations. These are available irrespective of + ifunc support or the presence of additional architectural extensions. */ ENTRY (load_16) mov x5, x0 @@ -143,31 +142,6 @@ ENTRY (load_16) END (load_16) -#if HAVE_FEAT_LSE2 -ENTRY_FEAT (load_16, LSE2) - cbnz w1, 1f - - /* RELAXED. */ - ldp res0, res1, [x0] - ret -1: - cmp w1, SEQ_CST - b.eq 2f - - /* ACQUIRE/CONSUME (Load-AcquirePC semantics). */ - ldp res0, res1, [x0] - dmb ishld - ret - - /* SEQ_CST. */ -2: ldar tmp0, [x0] /* Block reordering with Store-Release instr. */ - ldp res0, res1, [x0] - dmb ishld - ret -END_FEAT (load_16, LSE2) -#endif - - ENTRY (store_16) cbnz w4, 2f @@ -185,23 +159,6 @@ ENTRY (store_16) END (store_16) -#if HAVE_FEAT_LSE2 -ENTRY_FEAT (store_16, LSE2) - cbnz w4, 1f - - /* RELAXED. */ - stp in0, in1, [x0] - ret - - /* RELEASE/SEQ_CST. */ -1: ldxp xzr, tmp0, [x0] - stlxp w4, in0, in1, [x0] - cbnz w4, 1b - ret -END_FEAT (store_16, LSE2) -#endif - - ENTRY (exchange_16) mov x5, x0 cbnz w4, 2f @@ -229,31 +186,6 @@ ENTRY (exchange_16) END (exchange_16) -#if HAVE_FEAT_LSE128 -ENTRY_FEAT (exchange_16, LSE128) - mov tmp0, x0 - mov res0, in0 - mov res1, in1 - cbnz w4, 1f - - /* RELAXED. */ - swpp res0, res1, [tmp0] - ret -1: - cmp w4, ACQUIRE - b.hi 2f - - /* ACQUIRE/CONSUME. */ - swppa res0, res1, [tmp0] - ret - - /* RELEASE/ACQ_REL/SEQ_CST. */ -2: swppal res0, res1, [tmp0] - ret -END_FEAT (exchange_16, LSE128) -#endif - - ENTRY (compare_exchange_16) ldp exp0, exp1, [x1] cbz w4, 3f @@ -301,43 +233,97 @@ ENTRY (compare_exchange_16) END (compare_exchange_16) -#if HAVE_FEAT_LSE2 -ENTRY_FEAT (compare_exchange_16, LSE) - ldp exp0, exp1, [x1] - mov tmp0, exp0 - mov tmp1, exp1 - cbz w4, 2f - cmp w4, RELEASE - b.hs 3f +ENTRY (fetch_or_16) + mov x5, x0 + cbnz w4, 2f - /* ACQUIRE/CONSUME. 
*/ - caspa exp0, exp1, in0, in1, [x0] -0: - cmp exp0, tmp0 - ccmp exp1, tmp1, 0, eq - bne 1f - mov x0, 1 + /* RELAXED. */ +1: ldxp res0, res1, [x5] + orr tmp0, res0, in0 + orr tmp1, res1, in1 + stxp w4, tmp0, tmp1, [x5] + cbnz w4, 1b ret -1: - stp exp0, exp1, [x1] - mov x0, 0 + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + orr tmp0, res0, in0 + orr tmp1, res1, in1 + stlxp w4, tmp0, tmp1, [x5] + cbnz w4, 2b ret +END (fetch_or_16) + + +ENTRY (or_fetch_16) + mov x5, x0 + cbnz w4, 2f /* RELAXED. */ -2: casp exp0, exp1, in0, in1, [x0] - b 0b +1: ldxp res0, res1, [x5] + orr res0, res0, in0 + orr res1, res1, in1 + stxp w4, res0, res1, [x5] + cbnz w4, 1b + ret - /* RELEASE. */ -3: b.hi 4f - caspl exp0, exp1, in0, in1, [x0] - b 0b + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + orr res0, res0, in0 + orr res1, res1, in1 + stlxp w4, res0, res1, [x5] + cbnz w4, 2b + ret +END (or_fetch_16) + + +ENTRY (fetch_and_16) + mov x5, x0 + cbnz w4, 2f + + /* RELAXED. */ +1: ldxp res0, res1, [x5] + and tmp0, res0, in0 + and tmp1, res1, in1 + stxp w4, tmp0, tmp1, [x5] + cbnz w4, 1b + ret + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + and tmp0, res0, in0 + and tmp1, res1, in1 + stlxp w4, tmp0, tmp1, [x5] + cbnz w4, 2b + ret +END (fetch_and_16) + + +ENTRY (and_fetch_16) + mov x5, x0 + cbnz w4, 2f + + /* RELAXED. */ +1: ldxp res0, res1, [x5] + and res0, res0, in0 + and res1, res1, in1 + stxp w4, res0, res1, [x5] + cbnz w4, 1b + ret + + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + and res0, res0, in0 + and res1, res1, in1 + stlxp w4, res0, res1, [x5] + cbnz w4, 2b + ret +END (and_fetch_16) - /* ACQ_REL/SEQ_CST. */ -4: caspal exp0, exp1, in0, in1, [x0] - b 0b -END_FEAT (compare_exchange_16, LSE) -#endif +/* The following functions are currently single-implementation operations, + so they are never assigned an ifunc selector. As such, they must be + reachable from __atomic_* entrypoints. */ ENTRY_ALIASED (fetch_add_16) mov x5, x0 @@ -427,309 +413,316 @@ ENTRY_ALIASED (sub_fetch_16) END (sub_fetch_16) -ENTRY (fetch_or_16) +ENTRY_ALIASED (fetch_xor_16) mov x5, x0 cbnz w4, 2f /* RELAXED. */ 1: ldxp res0, res1, [x5] - orr tmp0, res0, in0 - orr tmp1, res1, in1 + eor tmp0, res0, in0 + eor tmp1, res1, in1 stxp w4, tmp0, tmp1, [x5] cbnz w4, 1b ret /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ 2: ldaxp res0, res1, [x5] - orr tmp0, res0, in0 - orr tmp1, res1, in1 + eor tmp0, res0, in0 + eor tmp1, res1, in1 stlxp w4, tmp0, tmp1, [x5] cbnz w4, 2b ret -END (fetch_or_16) +END (fetch_xor_16) -#if HAVE_FEAT_LSE128 -ENTRY_FEAT (fetch_or_16, LSE128) - mov tmp0, x0 - mov res0, in0 - mov res1, in1 - cbnz w4, 1f +ENTRY_ALIASED (xor_fetch_16) + mov x5, x0 + cbnz w4, 2f /* RELAXED. */ - ldsetp res0, res1, [tmp0] - ret -1: - cmp w4, ACQUIRE - b.hi 2f - - /* ACQUIRE/CONSUME. */ - ldsetpa res0, res1, [tmp0] +1: ldxp res0, res1, [x5] + eor res0, res0, in0 + eor res1, res1, in1 + stxp w4, res0, res1, [x5] + cbnz w4, 1b ret - /* RELEASE/ACQ_REL/SEQ_CST. */ -2: ldsetpal res0, res1, [tmp0] + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + eor res0, res0, in0 + eor res1, res1, in1 + stlxp w4, res0, res1, [x5] + cbnz w4, 2b ret -END_FEAT (fetch_or_16, LSE128) -#endif +END (xor_fetch_16) -ENTRY (or_fetch_16) +ENTRY_ALIASED (fetch_nand_16) mov x5, x0 + mvn in0, in0 + mvn in1, in1 cbnz w4, 2f /* RELAXED. 
*/ 1: ldxp res0, res1, [x5] - orr res0, res0, in0 - orr res1, res1, in1 - stxp w4, res0, res1, [x5] + orn tmp0, in0, res0 + orn tmp1, in1, res1 + stxp w4, tmp0, tmp1, [x5] cbnz w4, 1b ret /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ 2: ldaxp res0, res1, [x5] - orr res0, res0, in0 - orr res1, res1, in1 - stlxp w4, res0, res1, [x5] + orn tmp0, in0, res0 + orn tmp1, in1, res1 + stlxp w4, tmp0, tmp1, [x5] cbnz w4, 2b ret -END (or_fetch_16) +END (fetch_nand_16) -#if HAVE_FEAT_LSE128 -ENTRY_FEAT (or_fetch_16, LSE128) - cbnz w4, 1f - mov tmp0, in0 - mov tmp1, in1 +ENTRY_ALIASED (nand_fetch_16) + mov x5, x0 + mvn in0, in0 + mvn in1, in1 + cbnz w4, 2f /* RELAXED. */ - ldsetp in0, in1, [x0] - orr res0, in0, tmp0 - orr res1, in1, tmp1 +1: ldxp res0, res1, [x5] + orn res0, in0, res0 + orn res1, in1, res1 + stxp w4, res0, res1, [x5] + cbnz w4, 1b ret -1: - cmp w4, ACQUIRE - b.hi 2f - /* ACQUIRE/CONSUME. */ - ldsetpa in0, in1, [x0] - orr res0, in0, tmp0 - orr res1, in1, tmp1 + /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ +2: ldaxp res0, res1, [x5] + orn res0, in0, res0 + orn res1, in1, res1 + stlxp w4, res0, res1, [x5] + cbnz w4, 2b ret +END (nand_fetch_16) - /* RELEASE/ACQ_REL/SEQ_CST. */ -2: ldsetpal in0, in1, [x0] - orr res0, in0, tmp0 - orr res1, in1, tmp1 - ret -END_FEAT (or_fetch_16, LSE128) -#endif +/* __atomic_test_and_set is always inlined, so this entry is unused and + only required for completeness. */ +ENTRY_ALIASED (test_and_set_16) -ENTRY (fetch_and_16) + /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ mov x5, x0 - cbnz w4, 2f - - /* RELAXED. */ -1: ldxp res0, res1, [x5] - and tmp0, res0, in0 - and tmp1, res1, in1 - stxp w4, tmp0, tmp1, [x5] +1: ldaxrb w0, [x5] + stlxrb w4, w2, [x5] cbnz w4, 1b ret +END (test_and_set_16) - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - and tmp0, res0, in0 - and tmp1, res1, in1 - stlxp w4, tmp0, tmp1, [x5] - cbnz w4, 2b - ret -END (fetch_and_16) - +/* Ensure extension-specific implementations are not included unless ifunc + support is present, along with necessary assembler support. */ -#if HAVE_FEAT_LSE128 -ENTRY_FEAT (fetch_and_16, LSE128) - mov tmp0, x0 - mvn res0, in0 - mvn res1, in1 - cbnz w4, 1f +#if HAVE_IFUNC +ENTRY_FEAT (load_16, LSE2) + cbnz w1, 1f /* RELAXED. */ - ldclrp res0, res1, [tmp0] + ldp res0, res1, [x0] ret - 1: - cmp w4, ACQUIRE - b.hi 2f + cmp w1, SEQ_CST + b.eq 2f - /* ACQUIRE/CONSUME. */ - ldclrpa res0, res1, [tmp0] + /* ACQUIRE/CONSUME (Load-AcquirePC semantics). */ + ldp res0, res1, [x0] + dmb ishld ret - /* RELEASE/ACQ_REL/SEQ_CST. */ -2: ldclrpal res0, res1, [tmp0] + /* SEQ_CST. */ +2: ldar tmp0, [x0] /* Block reordering with Store-Release instr. */ + ldp res0, res1, [x0] + dmb ishld ret -END_FEAT (fetch_and_16, LSE128) -#endif +END_FEAT (load_16, LSE2) -ENTRY (and_fetch_16) - mov x5, x0 - cbnz w4, 2f +ENTRY_FEAT (store_16, LSE2) + cbnz w4, 1f /* RELAXED. */ -1: ldxp res0, res1, [x5] - and res0, res0, in0 - and res1, res1, in1 - stxp w4, res0, res1, [x5] + stp in0, in1, [x0] + ret + + /* RELEASE/SEQ_CST. */ +1: ldxp xzr, tmp0, [x0] + stlxp w4, in0, in1, [x0] cbnz w4, 1b ret +END_FEAT (store_16, LSE2) - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - and res0, res0, in0 - and res1, res1, in1 - stlxp w4, res0, res1, [x5] - cbnz w4, 2b + +ENTRY_FEAT (compare_exchange_16, LSE) + ldp exp0, exp1, [x1] + mov tmp0, exp0 + mov tmp1, exp1 + cbz w4, 2f + cmp w4, RELEASE + b.hs 3f + + /* ACQUIRE/CONSUME. 
*/ + caspa exp0, exp1, in0, in1, [x0] +0: + cmp exp0, tmp0 + ccmp exp1, tmp1, 0, eq + bne 1f + mov x0, 1 ret -END (and_fetch_16) +1: + stp exp0, exp1, [x1] + mov x0, 0 + ret + + /* RELAXED. */ +2: casp exp0, exp1, in0, in1, [x0] + b 0b + + /* RELEASE. */ +3: b.hi 4f + caspl exp0, exp1, in0, in1, [x0] + b 0b + + /* ACQ_REL/SEQ_CST. */ +4: caspal exp0, exp1, in0, in1, [x0] + b 0b +END_FEAT (compare_exchange_16, LSE) #if HAVE_FEAT_LSE128 -ENTRY_FEAT (and_fetch_16, LSE128) - mvn tmp0, in0 - mvn tmp0, in1 +ENTRY_FEAT (exchange_16, LSE128) + mov tmp0, x0 + mov res0, in0 + mov res1, in1 cbnz w4, 1f /* RELAXED. */ - ldclrp tmp0, tmp1, [x0] - and res0, tmp0, in0 - and res1, tmp1, in1 + swpp res0, res1, [tmp0] ret - 1: cmp w4, ACQUIRE b.hi 2f /* ACQUIRE/CONSUME. */ - ldclrpa tmp0, tmp1, [x0] - and res0, tmp0, in0 - and res1, tmp1, in1 + swppa res0, res1, [tmp0] ret /* RELEASE/ACQ_REL/SEQ_CST. */ -2: ldclrpal tmp0, tmp1, [x5] - and res0, tmp0, in0 - and res1, tmp1, in1 +2: swppal res0, res1, [tmp0] ret -END_FEAT (and_fetch_16, LSE128) -#endif +END_FEAT (exchange_16, LSE128) -ENTRY_ALIASED (fetch_xor_16) - mov x5, x0 - cbnz w4, 2f +ENTRY_FEAT (fetch_or_16, LSE128) + mov tmp0, x0 + mov res0, in0 + mov res1, in1 + cbnz w4, 1f /* RELAXED. */ -1: ldxp res0, res1, [x5] - eor tmp0, res0, in0 - eor tmp1, res1, in1 - stxp w4, tmp0, tmp1, [x5] - cbnz w4, 1b + ldsetp res0, res1, [tmp0] ret +1: + cmp w4, ACQUIRE + b.hi 2f - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - eor tmp0, res0, in0 - eor tmp1, res1, in1 - stlxp w4, tmp0, tmp1, [x5] - cbnz w4, 2b + /* ACQUIRE/CONSUME. */ + ldsetpa res0, res1, [tmp0] ret -END (fetch_xor_16) + /* RELEASE/ACQ_REL/SEQ_CST. */ +2: ldsetpal res0, res1, [tmp0] + ret +END_FEAT (fetch_or_16, LSE128) -ENTRY_ALIASED (xor_fetch_16) - mov x5, x0 - cbnz w4, 2f + +ENTRY_FEAT (or_fetch_16, LSE128) + cbnz w4, 1f + mov tmp0, in0 + mov tmp1, in1 /* RELAXED. */ -1: ldxp res0, res1, [x5] - eor res0, res0, in0 - eor res1, res1, in1 - stxp w4, res0, res1, [x5] - cbnz w4, 1b + ldsetp in0, in1, [x0] + orr res0, in0, tmp0 + orr res1, in1, tmp1 ret +1: + cmp w4, ACQUIRE + b.hi 2f - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - eor res0, res0, in0 - eor res1, res1, in1 - stlxp w4, res0, res1, [x5] - cbnz w4, 2b + /* ACQUIRE/CONSUME. */ + ldsetpa in0, in1, [x0] + orr res0, in0, tmp0 + orr res1, in1, tmp1 ret -END (xor_fetch_16) + /* RELEASE/ACQ_REL/SEQ_CST. */ +2: ldsetpal in0, in1, [x0] + orr res0, in0, tmp0 + orr res1, in1, tmp1 + ret +END_FEAT (or_fetch_16, LSE128) -ENTRY_ALIASED (fetch_nand_16) - mov x5, x0 - mvn in0, in0 - mvn in1, in1 - cbnz w4, 2f + +ENTRY_FEAT (fetch_and_16, LSE128) + mov tmp0, x0 + mvn res0, in0 + mvn res1, in1 + cbnz w4, 1f /* RELAXED. */ -1: ldxp res0, res1, [x5] - orn tmp0, in0, res0 - orn tmp1, in1, res1 - stxp w4, tmp0, tmp1, [x5] - cbnz w4, 1b + ldclrp res0, res1, [tmp0] ret - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - orn tmp0, in0, res0 - orn tmp1, in1, res1 - stlxp w4, tmp0, tmp1, [x5] - cbnz w4, 2b +1: + cmp w4, ACQUIRE + b.hi 2f + + /* ACQUIRE/CONSUME. */ + ldclrpa res0, res1, [tmp0] ret -END (fetch_nand_16) + /* RELEASE/ACQ_REL/SEQ_CST. */ +2: ldclrpal res0, res1, [tmp0] + ret +END_FEAT (fetch_and_16, LSE128) -ENTRY_ALIASED (nand_fetch_16) - mov x5, x0 - mvn in0, in0 - mvn in1, in1 - cbnz w4, 2f - /* RELAXED. 
*/ -1: ldxp res0, res1, [x5] - orn res0, in0, res0 - orn res1, in1, res1 - stxp w4, res0, res1, [x5] - cbnz w4, 1b - ret +ENTRY_FEAT (and_fetch_16, LSE128) + mvn tmp0, in0 + mvn tmp0, in1 + cbnz w4, 1f - /* ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ -2: ldaxp res0, res1, [x5] - orn res0, in0, res0 - orn res1, in1, res1 - stlxp w4, res0, res1, [x5] - cbnz w4, 2b + /* RELAXED. */ + ldclrp tmp0, tmp1, [x0] + and res0, tmp0, in0 + and res1, tmp1, in1 ret -END (nand_fetch_16) +1: + cmp w4, ACQUIRE + b.hi 2f -/* __atomic_test_and_set is always inlined, so this entry is unused and - only required for completeness. */ -ENTRY_ALIASED (test_and_set_16) + /* ACQUIRE/CONSUME. */ + ldclrpa tmp0, tmp1, [x0] + and res0, tmp0, in0 + and res1, tmp1, in1 + ret - /* RELAXED/ACQUIRE/CONSUME/RELEASE/ACQ_REL/SEQ_CST. */ - mov x5, x0 -1: ldaxrb w0, [x5] - stlxrb w4, w2, [x5] - cbnz w4, 1b + /* RELEASE/ACQ_REL/SEQ_CST. */ +2: ldclrpal tmp0, tmp1, [x5] + and res0, tmp0, in0 + and res1, tmp1, in1 ret -END (test_and_set_16) +END_FEAT (and_fetch_16, LSE128) +#endif /* HAVE_FEAT_LSE128 */ +#endif /* HAVE_IFUNC */ /* GNU_PROPERTY_AARCH64_* macros from elf.h for use in asm code. */ -- 2.34.1