On 10/21/24 18:30, Jonathan Wakely wrote: > On Mon, 21 Oct 2024 at 17:29, Ricardo Jesus <r...@nvidia.com> wrote: >> >> On 10/14/24 14:37, Jonathan Wakely wrote: >>> >>> >>> On Mon, 14 Oct 2024 at 14:36, Kyrylo Tkachov <ktkac...@nvidia.com> wrote: >>>> >>>> >>>> >>>>> On 14 Oct 2024, at 15:28, Ricardo Jesus <r...@nvidia.com> wrote: >>>>> >>>>> External email: Use caution opening links or attachments >>>>> >>>>> >>>>> This patch modifies the implementation of the vectorized mersenne >>>>> twister random number generator to use __builtin_shufflevector instead >>>>> of __builtin_shuffle. This makes it (almost) compatible with Clang. >>>>> >>>>> To make the implementation fully compatible with Clang, Clang will need >>>>> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which >>>>> currently it does not. This looks like an oversight in Clang and so will >>>>> be addressed separately. >>>>> >>>>> I see no codegen change with this patch. >>>> >>>> FWIW the change looks ok to me from an aarch64 perspective. >>>> The only potential problem could have been a compile error so as long as >>>> the CI buildbots are happy I think this would be safe to take. >>> >>> OK thanks for looking. I'll wait to see that CI is green (thanks for >>> resubmitting it, Ricardo!) and then push it. >>> >> >> Hi Jonathan, >> >> When you have a chance, could you please check if the CI is looking good? > > Yes, it's green now: > https://patchwork.sourceware.org/project/gcc/patch/20241014132802.506972-1-...@nvidia.com/ > > With Kyrill's ack for the aarch64 codegen side, I'm happy to approve > this and will push it. Thanks! >
Perfect, thank you very much! Ricardo >> >> Thanks very much! >> >> Ricardo >> >>> >>>> Thanks, >>>> Kyrill >>>> >>>>> >>>>> Bootstrapped and tested on aarch64-none-linux-gnu. >>>>> >>>>> Signed-off-by: Ricardo Jesus <r...@nvidia.com> >>>>> >>>>> 2024-09-05 Ricardo Jesus <r...@nvidia.com> >>>>> >>>>> * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses >>>>> of __builtin_shuffle with __builtin_shufflevector. >>>>> (__aarch64_lsl_128): Move shift amount to a template parameter. >>>>> (__aarch64_lsr_128): Move shift amount to a template parameter. >>>>> (__aarch64_recursion): Update call sites of __aarch64_lsl_128 >>>>> and __aarch64_lsr_128. >>>>> --- >>>>> .../config/cpu/aarch64/opt/ext/opt_random.h | 28 +++++++++++-------- >>>>> 1 file changed, 16 insertions(+), 12 deletions(-) >>>>> >>>>> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h >>>>> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h >>>>> index 7f756d1572f..7eb816abcd0 100644 >>>>> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h >>>>> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h >>>>> @@ -35,13 +35,13 @@ >>>>> #ifdef __ARM_NEON >>>>> >>>>> #ifdef __ARM_BIG_ENDIAN >>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \ >>>>> - {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ >>>>> - 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C}) >>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \ >>>>> + 16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \ >>>>> + 24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C) >>>>> #else >>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \ >>>>> - {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ >>>>> - _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15}) >>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \ >>>>> + _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \ >>>>> + _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15) >>>>> #endif >>>>> >>>>> #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ >>>>> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION >>>>> namespace { >>>>> // Logical Shift right 128-bits by c * 8 bits >>>>> >>>>> - __extension__ extern __inline __Uint32x4_t >>>>> + __extension__ >>>>> + template<int __c> >>>>> + extern __inline __Uint32x4_t >>>>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>>>> - __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c) >>>>> + __aarch64_lsr_128 (__Uint8x16_t __a) >>>>> { >>>>> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, >>>>> 0, 0, 0, 0, 0, 0, 0, 0}; >>>>> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION >>>>> >>>>> // Logical Shift left 128-bits by c * 8 bits >>>>> >>>>> - __extension__ extern __inline __Uint32x4_t >>>>> + __extension__ >>>>> + template<int __c> >>>>> + extern __inline __Uint32x4_t >>>>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) >>>>> - __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c) >>>>> + __aarch64_lsl_128 (__Uint8x16_t __a) >>>>> { >>>>> const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0, >>>>> 0, 0, 0, 0, 0, 0, 0, 0}; >>>>> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION >>>>> __Uint32x4_t __e) >>>>> { >>>>> __Uint32x4_t __y = (__b >> __sr1); >>>>> - __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2); >>>>> + __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c); >>>>> >>>>> __Uint32x4_t __v = __d << __sl1; >>>>> >>>>> __z = __z ^ __a; >>>>> __z = __z ^ __v; >>>>> >>>>> - __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2); >>>>> + __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a); >>>>> >>>>> __y = __y & __e; >>>>> __z = __z ^ __x; >>>>> -- >>>>> 2.44.0 >>>>> >>>> >>> >> >