On 10/21/24 18:30, Jonathan Wakely wrote:
> On Mon, 21 Oct 2024 at 17:29, Ricardo Jesus <r...@nvidia.com> wrote:
>>
>> On 10/14/24 14:37, Jonathan Wakely wrote:
>>>
>>>
>>> On Mon, 14 Oct 2024 at 14:36, Kyrylo Tkachov <ktkac...@nvidia.com> wrote:
>>>>
>>>>
>>>>
>>>>> On 14 Oct 2024, at 15:28, Ricardo Jesus <r...@nvidia.com> wrote:
>>>>>
>>>>> External email: Use caution opening links or attachments
>>>>>
>>>>>
>>>>> This patch modifies the implementation of the vectorized mersenne
>>>>> twister random number generator to use __builtin_shufflevector instead
>>>>> of __builtin_shuffle. This makes it (almost) compatible with Clang.
>>>>>
>>>>> To make the implementation fully compatible with Clang, Clang will need
>>>>> to support internal Neon types like __Uint8x16_t and __Uint32x4_t, which
>>>>> currently it does not. This looks like an oversight in Clang and so will
>>>>> be addressed separately.
>>>>>
>>>>> I see no codegen change with this patch.
>>>>
>>>> FWIW the change looks ok to me from an aarch64 perspective.
>>>> The only potential problem could have been a compile error so as long as 
>>>> the CI buildbots are happy I think this would be safe to take.
>>>
>>> OK thanks for looking. I'll wait to see that CI is green (thanks for
>>> resubmitting it, Ricardo!) and then push it.
>>>
>>
>> Hi Jonathan,
>>
>> When you have a chance, could you please check if the CI is looking good?
> 
> Yes, it's green now:
> https://patchwork.sourceware.org/project/gcc/patch/20241014132802.506972-1-...@nvidia.com/
> 
> With Kyrill's ack for the aarch64 codegen side, I'm happy to approve
> this and will push it. Thanks!
> 

Perfect, thank you very much!

Ricardo

>>
>> Thanks very much!
>>
>> Ricardo
>>
>>>
>>>> Thanks,
>>>> Kyrill
>>>>
>>>>>
>>>>> Bootstrapped and tested on aarch64-none-linux-gnu.
>>>>>
>>>>> Signed-off-by: Ricardo Jesus <r...@nvidia.com>
>>>>>
>>>>> 2024-09-05  Ricardo Jesus  <r...@nvidia.com>
>>>>>
>>>>>        * config/cpu/aarch64/opt/ext/opt_random.h (__VEXT): Replace uses
>>>>>        of __builtin_shuffle with __builtin_shufflevector.
>>>>>        (__aarch64_lsl_128): Move shift amount to a template parameter.
>>>>>        (__aarch64_lsr_128): Move shift amount to a template parameter.
>>>>>        (__aarch64_recursion): Update call sites of __aarch64_lsl_128
>>>>>        and __aarch64_lsr_128.
>>>>> ---
>>>>> .../config/cpu/aarch64/opt/ext/opt_random.h   | 28 +++++++++++--------
>>>>> 1 file changed, 16 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h 
>>>>> b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>>> index 7f756d1572f..7eb816abcd0 100644
>>>>> --- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>>> +++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
>>>>> @@ -35,13 +35,13 @@
>>>>> #ifdef __ARM_NEON
>>>>>
>>>>> #ifdef __ARM_BIG_ENDIAN
>>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_A, _B, (__Uint8x16_t) \
>>>>> -    {16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
>>>>> -     24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C})
>>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_A, _B, \
>>>>> +    16-_C, 17-_C, 18-_C, 19-_C, 20-_C, 21-_C, 22-_C, 23-_C, \
>>>>> +    24-_C, 25-_C, 26-_C, 27-_C, 28-_C, 29-_C, 30-_C, 31-_C)
>>>>> #else
>>>>> -# define __VEXT(_A,_B,_C) __builtin_shuffle (_B, _A, (__Uint8x16_t) \
>>>>> -    {_C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
>>>>> -     _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15})
>>>>> +# define __VEXT(_A,_B,_C) __builtin_shufflevector (_B, _A, \
>>>>> +    _C, _C+1, _C+2, _C+3, _C+4, _C+5, _C+6, _C+7, \
>>>>> +    _C+8, _C+9, _C+10, _C+11, _C+12, _C+13, _C+14, _C+15)
>>>>> #endif
>>>>>
>>>>> #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
>>>>> @@ -52,9 +52,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>>   namespace {
>>>>>     // Logical Shift right 128-bits by c * 8 bits
>>>>>
>>>>> -    __extension__ extern __inline __Uint32x4_t
>>>>> +    __extension__
>>>>> +    template<int __c>
>>>>> +    extern __inline __Uint32x4_t
>>>>>     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>>>> -    __aarch64_lsr_128 (__Uint8x16_t __a, __const int __c)
>>>>> +    __aarch64_lsr_128 (__Uint8x16_t __a)
>>>>>     {
>>>>>       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>>>>>                                   0, 0, 0, 0, 0, 0, 0, 0};
>>>>> @@ -64,9 +66,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>>
>>>>>     // Logical Shift left 128-bits by c * 8 bits
>>>>>
>>>>> -    __extension__ extern __inline __Uint32x4_t
>>>>> +    __extension__
>>>>> +    template<int __c>
>>>>> +    extern __inline __Uint32x4_t
>>>>>     __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>>>>> -    __aarch64_lsl_128 (__Uint8x16_t __a, __const int __c)
>>>>> +    __aarch64_lsl_128 (__Uint8x16_t __a)
>>>>>     {
>>>>>       const __Uint8x16_t __zero = {0, 0, 0, 0, 0, 0, 0, 0,
>>>>>                                   0, 0, 0, 0, 0, 0, 0, 0};
>>>>> @@ -82,14 +86,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>>>>>                                               __Uint32x4_t __e)
>>>>>     {
>>>>>       __Uint32x4_t __y = (__b >> __sr1);
>>>>> -      __Uint32x4_t __z = __aarch64_lsr_128 ((__Uint8x16_t) __c, __sr2);
>>>>> +      __Uint32x4_t __z = __aarch64_lsr_128<__sr2> ((__Uint8x16_t) __c);
>>>>>
>>>>>       __Uint32x4_t __v = __d << __sl1;
>>>>>
>>>>>       __z = __z ^ __a;
>>>>>       __z = __z ^ __v;
>>>>>
>>>>> -      __Uint32x4_t __x = __aarch64_lsl_128 ((__Uint8x16_t) __a, __sl2);
>>>>> +      __Uint32x4_t __x = __aarch64_lsl_128<__sl2> ((__Uint8x16_t) __a);
>>>>>
>>>>>       __y = __y & __e;
>>>>>       __z = __z ^ __x;
>>>>> --
>>>>> 2.44.0
>>>>>
>>>>
>>>
>>
> 

Reply via email to