> On 14 Oct 2024, at 15:28, Ricardo Jesus <r...@nvidia.com> wrote:
>
> External email: Use caution opening links or attachments
>
>
> This patch modifies the implementation of the vectorized mersenne
> twister random number generator to use __builtin_shufflevector instead
> of __builtin_shuffle. This makes it (almost) compatible with Clang.
>
> To make the implementation fully compatible with Clang, Clang will need
> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
> currently it does not. This looks like an oversight in Clang and so will
> be addressed separately.
>
> I see no codegen change with this patch.
FWIW the change looks ok to me from an aarch64 perspective.
The only potential problem could have been a compile error so as long as the CI
buildbots are happy I think this would be safe to take.
Thanks,
Kyrill
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Signed-off-by: Ricardo Jesus <r...@nvidia.com>
>
> 2024-09-05 Ricardo Jesus <r...@nvidia.com>
>
> * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
> of __builtin_shuffle with __builtin_shufflevector.
> (__aarch64_lsl_128): Move shift amount to a template parameter.
> (__aarch64_lsr_128): Move shift amount to a template parameter.
> (__aarch64_recursion): Update call sites of __aarch64_lsl_128
> and __aarch64_lsr_128.
> ---
> .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++--------
> 1 file changed, 16 insertions(+), 12 deletions(-)
>
> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> index 7f756d1572f..7eb816abcd0 100644
> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
> @@ -35,13 +35,13 @@
> #ifdef __ARM_NEON
>
> #ifdef __ARM_BIG_ENDIAN
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
> - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
> + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
> + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
> #else
> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
> - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
> + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
> + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
> #endif
>
> #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> namespace {
> // Logical Shift right 128-bits by c * 8 bits
>
> - __extension__ extern __inline __Uint32x4_t
> + __extension__
> + template<int __c>
> + extern __inline __Uint32x4_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
> + __aarch64_lsr_128 (__Uint8x16_t __a)
> {
> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0, 0, 0, 0, 0};
> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>
> // Logical Shift left 128-bits by c * 8 bits
>
> - __extension__ extern __inline __Uint32x4_t
> + __extension__
> + template<int __c>
> + extern __inline __Uint32x4_t
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
> + __aarch64_lsl_128 (__Uint8x16_t __a)
> {
> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0, 0, 0, 0, 0};
> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> __Uint32x4_t __e)
> {
> __Uint32x4_t __y = (__b >> __sr1);
> - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
> + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
>
> __Uint32x4_t __v = __d << __sl1;
>
> __z = __z ^ __a;
> __z = __z ^ __v;
>
> - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
> + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
>
> __y = __y & __e;
> __z = __z ^ __x;
> --
> 2.44.0
>