llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Yonah Goldberg (YonahGoldberg) <details> <summary>Changes</summary> Currently CUDA fp16 is implemented as inline PTX in `cuda_fp16.hpp` in the CTK. In CUDA 13.3, we are moving the implementations to libdevice. This PR declares the new libdevice functions, which operate on LLVM `half`, and adds wrappers for the CUDA `__half` and `__half2` types, which are just `unsigned short` and `unsigned int` under the hood. --- Patch is 49.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174005.diff 2 Files Affected: - (modified) clang/lib/Headers/__clang_cuda_device_functions.h (+744-4) - (modified) clang/lib/Headers/__clang_cuda_libdevice_declares.h (+190) ``````````diff diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h index 0226fe95abab6..e31cb87a25140 100644 --- a/clang/lib/Headers/__clang_cuda_device_functions.h +++ b/clang/lib/Headers/__clang_cuda_device_functions.h @@ -1086,7 +1086,6 @@ __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { : "r"(__a), "r"(__b), "r"(0)); return r; } - __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" @@ -1404,7 +1403,6 @@ __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); } - __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { unsigned int r; if ((__a & 0x8000) && (__b & 0x8000)) { @@ -1496,7 +1494,6 @@ __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { : "r"(__a), "r"(__b), "r"(0)); return r; } - __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" @@ -1505,7 +1502,6 @@ __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { return r; } __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } - __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" @@ -1550,6 +1546,750 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { } #endif // CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 13030 +typedef _Float16 _Float16x2 __attribute__((ext_vector_type(2))); + +// fp16 conversion functions +__DEVICE__ unsigned short __f16_double2half(double __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_double2half(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rn(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rn(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rz(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rz(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rd(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rd(__a)); +} +__DEVICE__ unsigned short __f16_float2half_ru(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_ru(__a)); +} +__DEVICE__ unsigned int __f16_float2half2_rn(float __a) { + return __builtin_bit_cast(unsigned int, __nv_f16_float2half2_rn(__a)); +} +__DEVICE__ unsigned int __f16_floats2half2_rn(float __a, float __b) { + return __builtin_bit_cast(unsigned int, __nv_f16_floats2half2_rn(__a, __b)); +} +__DEVICE__ float __f16_half2float(unsigned short __a) { + return __nv_f16_half2float(__builtin_bit_cast(_Float16, __a)); +} +__DEVICE__ float __f16_low2float(unsigned int __a) { + return __nv_f16_low2float(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ float __f16_high2float(unsigned int __a) { + return __nv_f16_high2float(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ char __f16_half2char_rz(unsigned short __h) { + return __nv_f16_half2char_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned char __f16_half2uchar_rz(unsigned short __h) { + return __nv_f16_half2uchar_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_rz(unsigned short __h) { + return __nv_f16_half2short_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_rz(unsigned short __h) { + return __nv_f16_half2ushort_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_rz(unsigned short __h) { + return __nv_f16_half2int_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_rz(unsigned short __h) { + return __nv_f16_half2uint_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_rz(unsigned short __h) { + return __nv_f16_half2ll_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_rz(unsigned short __h) { + return __nv_f16_half2ull_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ float2 __f16_half22float2(unsigned int __a) { + return __nv_f16_half22float2(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ int __f16_half2int_rn(unsigned short __h) { + return __nv_f16_half2int_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_rd(unsigned short __h) { + return __nv_f16_half2int_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_ru(unsigned short __h) { + return __nv_f16_half2int_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_int2half_rn(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_int2half_rz(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_int2half_rd(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_int2half_ru(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_ru(__i)); +} +__DEVICE__ short __f16_half2short_rn(unsigned short __h) { + return __nv_f16_half2short_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_rd(unsigned short __h) { + return __nv_f16_half2short_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_ru(unsigned short __h) { + return __nv_f16_half2short_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_short2half_rn(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_short2half_rz(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_short2half_rd(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_short2half_ru(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_ru(__i)); +} +__DEVICE__ unsigned int __f16_half2uint_rn(unsigned short __h) { + return __nv_f16_half2uint_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_rd(unsigned short __h) { + return __nv_f16_half2uint_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_ru(unsigned short __h) { + return __nv_f16_half2uint_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_uint2half_rn(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_rz(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_rd(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_ru(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_ru(__i)); +} +__DEVICE__ unsigned short __f16_half2ushort_rn(unsigned short __h) { + return __nv_f16_half2ushort_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_rd(unsigned short __h) { + return __nv_f16_half2ushort_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_ru(unsigned short __h) { + return __nv_f16_half2ushort_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ushort2half_rn(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_rz(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_rd(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_ru(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_ru(__i)); +} +__DEVICE__ unsigned long long __f16_half2ull_rn(unsigned short __h) { + return __nv_f16_half2ull_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_rd(unsigned short __h) { + return __nv_f16_half2ull_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_ru(unsigned short __h) { + return __nv_f16_half2ull_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ull2half_rn(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_rz(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_rd(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_ru(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_ru(__i)); +} +__DEVICE__ long long __f16_half2ll_rn(unsigned short __h) { + return __nv_f16_half2ll_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_rd(unsigned short __h) { + return __nv_f16_half2ll_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_ru(unsigned short __h) { + return __nv_f16_half2ll_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ll2half_rn(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_rz(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_rd(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_ru(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_ru(__i)); +} + +// fp16 rounding functions +__DEVICE__ unsigned short __f16_trunc(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_trunc(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_ceil(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_ceil(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_floor(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_floor(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_rint(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_rint(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_trunc(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_trunc(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_floor(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_floor(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_ceil(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_ceil(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_rint(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_rint(__builtin_bit_cast(_Float16x2, __x))); +} + +// half2 utilities +__DEVICE__ unsigned int __f16_lows2half2(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_lows2half2(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16_highs2half2(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_highs2half2(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned short __f16_low2half(unsigned int __a) { + return __builtin_bit_cast( + unsigned short, __nv_f16_low2half(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_low2half2(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_low2half2(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_high2half2(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_high2half2(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned short __f16_high2half(unsigned int __a) { + return __builtin_bit_cast( + unsigned short, __nv_f16_high2half(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_halves2half2(unsigned short __a, + unsigned short __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_halves2half2(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned int __f16_half2half2(unsigned short __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_half2half2(__builtin_bit_cast(_Float16, __a))); +} +__DEVICE__ unsigned int __f16_lowhigh2highlow(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, + __nv_f16_lowhigh2highlow(__builtin_bit_cast(_Float16x2, __a))); +} + +// fp16 comparison functions +__DEVICE__ unsigned short __f16_max(unsigned short __x, unsigned short __y) { + return __builtin_bit_cast(unsigned short, + __nv_f16_max(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned short __f16_min(unsigned short __x, unsigned short __y) { + return __builtin_bit_cast(unsigned short, + __nv_f16_min(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned int __f16x2_max(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_max(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_min(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_min(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_eq(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_eq(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ne(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_ne(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_le(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_le(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ge(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_ge(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_lt(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_lt(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_gt(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_gt(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_equ(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_equ(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_neu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_neu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_leu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_leu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_geu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_geu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ltu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_ltu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_gtu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_gtu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_eq_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_eq_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ne_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ne_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_le_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_le_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ge_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ge_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_lt_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_lt_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_gt_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_gt_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_equ_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_equ_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_neu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_neu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_leu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_leu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_geu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_geu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ltu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ltu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_gtu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_gtu_mask(__builtin_bit_cast(_Float16x2, __a), + ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/174005 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
