Wilco Dijkstra via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Add support for AArch64 LSE and LSE2 to libatomic.  Disable outline atomics,
> and use LSE ifuncs for 1-8 byte atomics and LSE2 ifuncs for 16-byte atomics.
> On Neoverse V1, 16-byte atomics are ~4x faster due to avoiding locks.
>
> Note this is safe since we switch all 16-byte atomics using the same ifunc,
> so they either use locks or LSE2 atomics, but never a mix. This also improves
> ABI compatibility with LLVM: its inlined 16-byte atomics are compatible with
> the new libatomic if LSE2 is supported.
>
> Passes regress, OK for commit?
>
> libatomic/
>         Makefile.in: Regenerated with automake 1.15.1.
>         Makefile.am: Add atomic_16.S for AArch64.
>         configure.tgt: Disable outline atomics in AArch64 build.
>         config/linux/aarch64/atomic_16.S: New file - implementation of
>         ifuncs for 128-bit atomics.
>         config/linux/aarch64/host-config.h: Enable ifuncs, use LSE
>         (HWCAP_ATOMICS) for 1-8-byte atomics and LSE2 (HWCAP_USCAT) for
>         16-byte atomics.
>
> ---
> diff --git a/libatomic/Makefile.am b/libatomic/Makefile.am
> index 
> d88515e4a03bd812334ae0b7bf4c0bba119455dc..41e5da28512150780a2018386e22b4e70afcfa3f
>  100644
> --- a/libatomic/Makefile.am
> +++ b/libatomic/Makefile.am
> @@ -127,6 +127,8 @@ if HAVE_IFUNC
>  if ARCH_AARCH64_LINUX
>  IFUNC_OPTIONS             = -march=armv8-a+lse
>  libatomic_la_LIBADD += $(foreach s,$(SIZES),$(addsuffix 
> _$(s)_1_.lo,$(SIZEOBJS)))
> +libatomic_la_SOURCES += atomic_16.S
> +
>  endif
>  if ARCH_ARM_LINUX
>  IFUNC_OPTIONS             = -march=armv7-a+fp -DHAVE_KERNEL64
> diff --git a/libatomic/Makefile.in b/libatomic/Makefile.in
> index 
> 80d25653dc75cca995c8b0b2107a55f1234a6d52..89e29fc60a7fb74341b2f0f805e461847073082c
>  100644
> --- a/libatomic/Makefile.in
> +++ b/libatomic/Makefile.in
> @@ -90,13 +90,14 @@ build_triplet = @build@
>  host_triplet = @host@
>  target_triplet = @target@
>  @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_1 = $(foreach 
> s,$(SIZES),$(addsuffix _$(s)_1_.lo,$(SIZEOBJS)))
> -@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = $(foreach \
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_2 = atomic_16.S
> +@ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(foreach \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@       s,$(SIZES),$(addsuffix \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@       _$(s)_1_.lo,$(SIZEOBJS))) \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@       $(addsuffix \
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@       _8_2_.lo,$(SIZEOBJS))
> -@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_3 = $(addsuffix 
> _8_1_.lo,$(SIZEOBJS))
> -@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
> _16_1_.lo,$(SIZEOBJS)) \
> +@ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@am__append_4 = $(addsuffix 
> _8_1_.lo,$(SIZEOBJS))
> +@ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@am__append_5 = $(addsuffix 
> _16_1_.lo,$(SIZEOBJS)) \
>  @ARCH_X86_64_TRUE@@HAVE_IFUNC_TRUE@                 $(addsuffix 
> _16_2_.lo,$(SIZEOBJS))
>  
>  subdir = .
> @@ -154,8 +155,11 @@ am__uninstall_files_from_dir = { \
>    }
>  am__installdirs = "$(DESTDIR)$(toolexeclibdir)"
>  LTLIBRARIES = $(noinst_LTLIBRARIES) $(toolexeclib_LTLIBRARIES)
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@am__objects_1 =  \
> +@ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@   atomic_16.lo
>  am_libatomic_la_OBJECTS = gload.lo gstore.lo gcas.lo gexch.lo \
> -     glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo
> +     glfree.lo lock.lo init.lo fenv.lo fence.lo flag.lo \
> +     $(am__objects_1)
>  libatomic_la_OBJECTS = $(am_libatomic_la_OBJECTS)
>  AM_V_lt = $(am__v_lt_@AM_V@)
>  am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
> @@ -165,9 +169,9 @@ libatomic_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC 
> $(AM_LIBTOOLFLAGS) \
>       $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
>       $(libatomic_la_LDFLAGS) $(LDFLAGS) -o $@
>  libatomic_convenience_la_DEPENDENCIES = $(libatomic_la_LIBADD)
> -am__objects_1 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
> -     init.lo fenv.lo fence.lo flag.lo
> -am_libatomic_convenience_la_OBJECTS = $(am__objects_1)
> +am__objects_2 = gload.lo gstore.lo gcas.lo gexch.lo glfree.lo lock.lo \
> +     init.lo fenv.lo fence.lo flag.lo $(am__objects_1)
> +am_libatomic_convenience_la_OBJECTS = $(am__objects_2)
>  libatomic_convenience_la_OBJECTS =  \
>       $(am_libatomic_convenience_la_OBJECTS)
>  AM_V_P = $(am__v_P_@AM_V@)
> @@ -185,6 +189,16 @@ am__v_at_1 =
>  depcomp = $(SHELL) $(top_srcdir)/../depcomp
>  am__depfiles_maybe = depfiles
>  am__mv = mv -f
> +CPPASCOMPILE = $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
> +     $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CCASFLAGS) $(CCASFLAGS)
> +LTCPPASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
> +     $(LIBTOOLFLAGS) --mode=compile $(CCAS) $(DEFS) \
> +     $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
> +     $(AM_CCASFLAGS) $(CCASFLAGS)
> +AM_V_CPPAS = $(am__v_CPPAS_@AM_V@)
> +am__v_CPPAS_ = $(am__v_CPPAS_@AM_DEFAULT_V@)
> +am__v_CPPAS_0 = @echo "  CPPAS   " $@;
> +am__v_CPPAS_1 = 
>  COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
>       $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
>  LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
> @@ -369,6 +383,7 @@ pdfdir = @pdfdir@
>  prefix = @prefix@
>  program_transform_name = @program_transform_name@
>  psdir = @psdir@
> +runstatedir = @runstatedir@
>  sbindir = @sbindir@
>  sharedstatedir = @sharedstatedir@
>  srcdir = @srcdir@
> @@ -404,9 +419,8 @@ noinst_LTLIBRARIES = libatomic_convenience.la
>  
> @LIBAT_BUILD_VERSIONED_SHLIB_SUN_TRUE@@LIBAT_BUILD_VERSIONED_SHLIB_TRUE@libatomic_version_dep
>  = libatomic.map-sun
>  libatomic_version_info = -version-info $(libtool_VERSION)
>  libatomic_la_LDFLAGS = $(libatomic_version_info) $(libatomic_version_script) 
> $(lt_host_flags)
> -libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c 
> init.c \
> -     fenv.c fence.c flag.c
> -
> +libatomic_la_SOURCES = gload.c gstore.c gcas.c gexch.c glfree.c lock.c \
> +     init.c fenv.c fence.c flag.c $(am__append_2)
>  SIZEOBJS = load store cas exch fadd fsub fand fior fxor fnand tas
>  EXTRA_libatomic_la_SOURCES = $(addsuffix _n.c,$(SIZEOBJS))
>  libatomic_la_DEPENDENCIES = $(libatomic_la_LIBADD) $(libatomic_version_dep)
> @@ -432,8 +446,8 @@ all_c_files := $(foreach dir,$(search_path),$(wildcard 
> $(dir)/*.c))
>  # Then sort through them to find the one we want, and select the first.
>  M_SRC = $(firstword $(filter %/$(M_FILE), $(all_c_files)))
>  libatomic_la_LIBADD = $(foreach s,$(SIZES),$(addsuffix \
> -     _$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_2) \
> -     $(am__append_3) $(am__append_4)
> +     _$(s)_.lo,$(SIZEOBJS))) $(am__append_1) $(am__append_3) \
> +     $(am__append_4) $(am__append_5)
>  @ARCH_AARCH64_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv8-a+lse
>  @ARCH_ARM_LINUX_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=armv7-a+fp 
> -DHAVE_KERNEL64
>  @ARCH_I386_TRUE@@HAVE_IFUNC_TRUE@IFUNC_OPTIONS = -march=i586
> @@ -450,7 +464,7 @@ all: auto-config.h
>       $(MAKE) $(AM_MAKEFLAGS) all-recursive
>  
>  .SUFFIXES:
> -.SUFFIXES: .c .lo .o .obj
> +.SUFFIXES: .S .c .lo .o .obj
>  am--refresh: Makefile
>       @:
>  $(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am 
> $(top_srcdir)/../multilib.am $(am__configure_deps)
> @@ -559,6 +573,7 @@ mostlyclean-compile:
>  distclean-compile:
>       -rm -f *.tab.c
>  
> +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic_16.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fence.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fenv.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/flag.Plo@am__quote@
> @@ -570,6 +585,27 @@ distclean-compile:
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/init.Plo@am__quote@
>  @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@
>  
> +.S.o:
> +@am__fastdepCCAS_TRUE@       $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF 
> $(DEPDIR)/$*.Tpo -c -o $@ $<
> +@am__fastdepCCAS_TRUE@       $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo 
> $(DEPDIR)/$*.Po
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  $(AM_V_CPPAS)source='$<' object='$@' 
> libtool=no @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  DEPDIR=$(DEPDIR) $(CCASDEPMODE) 
> $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@      $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o 
> $@ $<
> +
> +.S.obj:
> +@am__fastdepCCAS_TRUE@       $(AM_V_CPPAS)$(CPPASCOMPILE) -MT $@ -MD -MP -MF 
> $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
> +@am__fastdepCCAS_TRUE@       $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo 
> $(DEPDIR)/$*.Po
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  $(AM_V_CPPAS)source='$<' object='$@' 
> libtool=no @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  DEPDIR=$(DEPDIR) $(CCASDEPMODE) 
> $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@      $(AM_V_CPPAS@am__nodep@)$(CPPASCOMPILE) -c -o 
> $@ `$(CYGPATH_W) '$<'`
> +
> +.S.lo:
> +@am__fastdepCCAS_TRUE@       $(AM_V_CPPAS)$(LTCPPASCOMPILE) -MT $@ -MD -MP 
> -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
> +@am__fastdepCCAS_TRUE@       $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo 
> $(DEPDIR)/$*.Plo
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  $(AM_V_CPPAS)source='$<' object='$@' 
> libtool=yes @AMDEPBACKSLASH@
> +@AMDEP_TRUE@@am__fastdepCCAS_FALSE@  DEPDIR=$(DEPDIR) $(CCASDEPMODE) 
> $(depcomp) @AMDEPBACKSLASH@
> +@am__fastdepCCAS_FALSE@      $(AM_V_CPPAS@am__nodep@)$(LTCPPASCOMPILE) -c -o 
> $@ $<
> +
>  .c.o:
>  @am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF 
> $(DEPDIR)/$*.Tpo -c -o $@ $<
>  @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
> diff --git a/libatomic/config/linux/aarch64/atomic_16.S 
> b/libatomic/config/linux/aarch64/atomic_16.S
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..5f23dba4529528c39425221402323d07a14cc518
> --- /dev/null
> +++ b/libatomic/config/linux/aarch64/atomic_16.S
> @@ -0,0 +1,422 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU Atomic Library (libatomic).
> +
> +   Libatomic is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3 of the License, or
> +   (at your option) any later version.
> +
> +   Libatomic is distributed in the hope that it will be useful, but WITHOUT 
> ANY
> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
> +   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +   more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +
> +     .arch   armv8-a+lse
> +
> +#define ENTRY(name)          \
> +     .global name;           \
> +     .hidden name;           \
> +     .type name,%function;   \
> +     .p2align 4;             \
> +name:                                \
> +     .cfi_startproc;         \
> +     hint    34      // bti c
> +
> +#define END(name)            \
> +     .cfi_endproc;           \
> +     .size name, .-name;
> +
> +#define res0 x0
> +#define res1 x1
> +#define in0  x2
> +#define in1  x3
> +#define tmp0 x6
> +#define tmp1 x7
> +#define exp0 x8
> +#define exp1 x9
> +
> +#ifdef __AARCH64EB__
> +# define reslo x1
> +# define reshi x0
> +# define inlo  x3
> +# define inhi  x2
> +# define tmplo x7
> +# define tmphi x6
> +#else
> +# define reslo x0
> +# define reshi x1
> +# define inlo  x2
> +# define inhi  x3
> +# define tmplo x6
> +# define tmphi x7
> +#endif
> +
> +#define RELAXED 0
> +#define CONSUME 1
> +#define ACQUIRE 2
> +#define RELEASE 3
> +#define ACQ_REL 4
> +#define SEQ_CST 5
> +
> +
> +ENTRY (libat_load_16_i1)
> +     cbnz    w1, 1f
> +     ldp     res0, res1, [x0]
> +     ret
> +1:
> +     cmp     w1, ACQUIRE
> +     b.hi    2f
> +     ldp     res0, res1, [x0]
> +     dmb     ishld
> +     ret
> +2:
> +     ldp     res0, res1, [x0]
> +     dmb     ish
> +     ret
> +END (libat_load_16_i1)
> +
> +
> +ENTRY (libat_store_16_i1)
> +     cbnz    w4, 1f
> +     stp     in0, in1, [x0]
> +     ret
> +1:
> +     dmb     ish
> +     stp     in0, in1, [x0]
> +     cmp     w4, SEQ_CST
> +     beq     2f
> +     ret
> +2:
> +     dmb     ish
> +     ret
> +END (libat_store_16_i1)
> +
> +
> +ENTRY (libat_exchange_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     stxp    w4, in0, in1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     cmp     w4, ACQUIRE
> +     b.hi    4f
> +3:
> +     ldaxp   res0, res1, [x5]
> +     stxp    w4, in0, in1, [x5]
> +     cbnz    w4, 3b
> +     ret
> +4:
> +     cmp     w4, RELEASE
> +     b.ne    6f
> +5:
> +     ldxp    res0, res1, [x5]
> +     stlxp   w4, in0, in1, [x5]
> +     cbnz    w4, 5b
> +     ret
> +6:
> +     ldaxp   res0, res1, [x5]
> +     stlxp   w4, in0, in1, [x5]
> +     cbnz    w4, 6b
> +     ret
> +END (libat_exchange_16_i1)
> +
> +
> +ENTRY (libat_compare_exchange_16_i1)
> +     ldp     exp0, exp1, [x1]
> +     mov     tmp0, exp0
> +     mov     tmp1, exp1
> +     cbz     w5, 2f
> +     cmp     w5, RELEASE
> +     b.hs    3f
> +     caspa   exp0, exp1, in0, in1, [x0]
> +0:
> +     cmp     exp0, tmp0
> +     ccmp    exp1, tmp1, 0, eq
> +     bne     1f
> +     mov     x0, 1
> +     ret
> +1:
> +     stp     exp0, exp1, [x1]
> +     mov     x0, 0
> +     ret
> +2:
> +     casp    exp0, exp1, in0, in1, [x0]
> +     b       0b
> +3:
> +     b.hi    4f
> +     caspl   exp0, exp1, in0, in1, [x0]
> +     b       0b
> +4:
> +     caspal  exp0, exp1, in0, in1, [x0]
> +     b       0b
> +END (libat_compare_exchange_16_i1)

As discussed off-list, it looks like this function
(libat_compare_exchange_16_i1) should use w4 rather than w5 in the
memory-order checks at its start.  OK with that change, thanks.

Obviously completely separate work, but it would be nice to teach gcc to
use ORN for the inline nand expansion.  Maybe that's not heavily used though.

Richard

> +ENTRY (libat_fetch_add_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     adds    tmplo, reslo, inlo
> +     adc     tmphi, reshi, inhi
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     adds    tmplo, reslo, inlo
> +     adc     tmphi, reshi, inhi
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_add_16_i1)
> +
> +
> +ENTRY (libat_add_fetch_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     adds    reslo, reslo, inlo
> +     adc     reshi, reshi, inhi
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     adds    reslo, reslo, inlo
> +     adc     reshi, reshi, inhi
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_add_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_sub_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     subs    tmplo, reslo, inlo
> +     sbc     tmphi, reshi, inhi
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     subs    tmplo, reslo, inlo
> +     sbc     tmphi, reshi, inhi
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_sub_16_i1)
> +
> +
> +ENTRY (libat_sub_fetch_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     subs    reslo, reslo, inlo
> +     sbc     reshi, reshi, inhi
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     subs    reslo, reslo, inlo
> +     sbc     reshi, reshi, inhi
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_sub_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_or_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     orr     tmp0, res0, in0
> +     orr     tmp1, res1, in1
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     orr     tmp0, res0, in0
> +     orr     tmp1, res1, in1
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_or_16_i1)
> +
> +
> +ENTRY (libat_or_fetch_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     orr     res0, res0, in0
> +     orr     res1, res1, in1
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     orr     res0, res0, in0
> +     orr     res1, res1, in1
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_or_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_and_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     and     tmp0, res0, in0
> +     and     tmp1, res1, in1
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     and     tmp0, res0, in0
> +     and     tmp1, res1, in1
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_and_16_i1)
> +
> +
> +ENTRY (libat_and_fetch_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     and     res0, res0, in0
> +     and     res1, res1, in1
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     and     res0, res0, in0
> +     and     res1, res1, in1
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_and_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_xor_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     eor     tmp0, res0, in0
> +     eor     tmp1, res1, in1
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     eor     tmp0, res0, in0
> +     eor     tmp1, res1, in1
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_xor_16_i1)
> +
> +
> +ENTRY (libat_xor_fetch_16_i1)
> +     mov     x5, x0
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     eor     res0, res0, in0
> +     eor     res1, res1, in1
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     eor     res0, res0, in0
> +     eor     res1, res1, in1
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_xor_fetch_16_i1)
> +
> +
> +ENTRY (libat_fetch_nand_16_i1)
> +     mov     x5, x0
> +     mvn     in0, in0
> +     mvn     in1, in1
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     orn     tmp0, in0, res0
> +     orn     tmp1, in1, res1
> +     stxp    w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     orn     tmp0, in0, res0
> +     orn     tmp1, in1, res1
> +     stlxp   w4, tmp0, tmp1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_fetch_nand_16_i1)
> +
> +
> +ENTRY (libat_nand_fetch_16_i1)
> +     mov     x5, x0
> +     mvn     in0, in0
> +     mvn     in1, in1
> +     cbnz    w4, 2f
> +1:
> +     ldxp    res0, res1, [x5]
> +     orn     res0, in0, res0
> +     orn     res1, in1, res1
> +     stxp    w4, res0, res1, [x5]
> +     cbnz    w4, 1b
> +     ret
> +2:
> +     ldaxp   res0, res1, [x5]
> +     orn     res0, in0, res0
> +     orn     res1, in1, res1
> +     stlxp   w4, res0, res1, [x5]
> +     cbnz    w4, 2b
> +     ret
> +END (libat_nand_fetch_16_i1)
> +
> +
> +ENTRY (libat_test_and_set_16_i1)
> +     mov     w2, 1
> +     cbnz    w1, 2f
> +     swpb    w0, w2, [x0]
> +     ret
> +
> +2:   swpalb  w0, w2, [x0]
> +     ret
> +END (libat_test_and_set_16_i1)
> +
> diff --git a/libatomic/config/linux/aarch64/host-config.h 
> b/libatomic/config/linux/aarch64/host-config.h
> index 
> 769ba6edc600099122b03af754cbbb079134596a..d9b5ab31bc85cfe1d5f3773c42442e408b174cbc
>  100644
> --- a/libatomic/config/linux/aarch64/host-config.h
> +++ b/libatomic/config/linux/aarch64/host-config.h
> @@ -22,14 +22,22 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #if HAVE_IFUNC
> -#include <stdlib.h>
> +#include <sys/auxv.h>
>  
> -# ifdef HWCAP_ATOMICS
> -#  define IFUNC_COND_1       (hwcap & HWCAP_ATOMICS)
> +#ifdef HWCAP_USCAT
> +# if N == 16
> +#  define IFUNC_COND_1       (hwcap & HWCAP_USCAT)
>  # else
> -#  define IFUNC_COND_1       (false)
> +#  define IFUNC_COND_1       (hwcap & HWCAP_ATOMICS)
>  # endif
> -# define IFUNC_NCOND(N)      (1)
> +#else
> +#  define IFUNC_COND_1       (false)
> +#endif
> +#define IFUNC_NCOND(N)       (1)
> +
> +#if N == 16 && IFUNC_ALT != 0
> +# define DONE 1
> +#endif
>  
>  #endif /* HAVE_IFUNC */
>  
> diff --git a/libatomic/configure.tgt b/libatomic/configure.tgt
> index 
> 33f8c91ce7718336b05e1077d3e91feb5b706730..113420f7beca143b5040fc9eb871461c2163ae44
>  100644
> --- a/libatomic/configure.tgt
> +++ b/libatomic/configure.tgt
> @@ -49,6 +49,7 @@ case "${target_cpu}" in
>               fi
>               ;;
>       esac
> +     XCFLAGS="${XCFLAGS} -mno-outline-atomics"
>       ;;
>    arm*)
>       ARCH=arm

Reply via email to