https://github.com/chaitanyav updated https://github.com/llvm/llvm-project/pull/161056
>From bc897b3e9806bbafdd67e8ec75d43847c3553454 Mon Sep 17 00:00:00 2001 From: NagaChaitanya Vellanki <[email protected]> Date: Sun, 28 Sep 2025 00:29:57 -0700 Subject: [PATCH] [X86][Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX/AVX512 IFMA madd52 intrinsics to be used in constexpr Resolves #160498 --- clang/include/clang/Basic/BuiltinsX86.td | 42 +-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 22 ++ clang/lib/AST/ExprConstant.cpp | 51 ++++ clang/lib/Headers/avx512ifmaintrin.h | 71 ++--- clang/lib/Headers/avxifmaintrin.h | 18 +- .../test/AST/ByteCode/x86-ifma-constexpr.cpp | 36 +++ clang/test/Sema/x86-ifma-constexpr.cpp | 262 ++++++++++++++++++ 7 files changed, 443 insertions(+), 59 deletions(-) create mode 100644 clang/test/AST/ByteCode/x86-ifma-constexpr.cpp create mode 100644 clang/test/Sema/x86-ifma-constexpr.cpp diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 77e599587edc3..a5247629e255f 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -2101,27 +2101,6 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def movdqa64store256_mask : X86Builtin<"void(_Vector<4, long long int *>, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512ifma", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; - def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; -} - -let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; -} - let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vcomisd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>, _Constant int, _Constant int)">; def vcomiss : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>, _Constant int, _Constant int)">; @@ -3128,6 +3107,27 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } +let Features = "avx512ifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def vpmadd52huq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; + def vpmadd52luq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; +} + +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def vpmadd52huq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; +} + +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpmadd52huq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; +} + +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def vpmadd52luq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; +} + +let Features = "avx512ifma,avx512vl|avxifma", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpmadd52luq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; +} + let Features = "avx512dq", Attributes = [NoThrow, Const] in { def kortestcqi : X86Builtin<"int(unsigned char, unsigned char)">; def kortestzqi : X86Builtin<"int(unsigned char, unsigned char)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 891344d4e6ed0..cf6a739cc5c60 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3564,6 +3564,28 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return F; }); + case X86::BI__builtin_ia32_vpmadd52luq128: + case X86::BI__builtin_ia32_vpmadd52luq256: + case X86::BI__builtin_ia32_vpmadd52luq512: + return interp__builtin_elementwise_triop( + S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) { + APSInt result = A * B + C; + APSInt mask(APSInt::getAllOnes(52).zext(64), false); + APSInt masked_result = result & mask; + return APSInt(masked_result, true); // unsigned result + }); + case X86::BI__builtin_ia32_vpmadd52huq128: + case X86::BI__builtin_ia32_vpmadd52huq256: + case X86::BI__builtin_ia32_vpmadd52huq512: + return interp__builtin_elementwise_triop( + S, OpPC, Call, [](const APSInt &A, const APSInt &B, const APSInt &C) { + APSInt result = A * B + C; + APSInt mask(APSInt::getAllOnes(52).zext(64), false); + APSInt shifted_result = result >> 52; + APSInt masked_result = shifted_result & mask; + return APSInt(masked_result, true); // unsigned result + }); + case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: case X86::BI__builtin_ia32_vpshldd512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index b706b14945b6d..c9d8a2b01dd74 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -60,10 +60,12 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/LSP/Logging.h" #include "llvm/Support/SaveAndRestore.h" #include "llvm/Support/SipHash.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" + #include <cstring> #include <functional> #include <limits> @@ -11869,6 +11871,55 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_vpmadd52luq128: + case X86::BI__builtin_ia32_vpmadd52luq256: + case X86::BI__builtin_ia32_vpmadd52luq512: { + APValue A, B, C; + if (!EvaluateAsRValue(Info, E->getArg(0), A) || + !EvaluateAsRValue(Info, E->getArg(1), B) || + !EvaluateAsRValue(Info, E->getArg(2), C)) + return false; + + unsigned ALen = A.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(ALen); + + for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) { + APInt AElt = A.getVectorElt(EltNum).getInt(); + APInt BElt = B.getVectorElt(EltNum).getInt(); + APInt CElt = C.getVectorElt(EltNum).getInt(); + APInt ResElt(AElt.zext(128) * BElt.zext(128) + CElt.zext(128)); + APInt Mask(64, 0x000FFFFFFFFFFFFFULL); + ResultElements.push_back(APValue(APSInt(ResElt.trunc(64) & Mask, false))); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case X86::BI__builtin_ia32_vpmadd52huq128: + case X86::BI__builtin_ia32_vpmadd52huq256: + case X86::BI__builtin_ia32_vpmadd52huq512: { + APValue A, B, C; + if (!EvaluateAsRValue(Info, E->getArg(0), A) || + !EvaluateAsRValue(Info, E->getArg(1), B) || + !EvaluateAsRValue(Info, E->getArg(2), C)) + return false; + + unsigned ALen = A.getVectorLength(); + SmallVector<APValue, 4> ResultElements; + ResultElements.reserve(ALen); + + for (unsigned EltNum = 0; EltNum < ALen; EltNum += 1) { + APInt AElt = A.getVectorElt(EltNum).getInt(); + APInt BElt = B.getVectorElt(EltNum).getInt(); + APInt CElt = C.getVectorElt(EltNum).getInt(); + APInt ResElt(AElt.zext(128) * BElt.zext(128) + CElt.zext(128)); + APInt Mask(64, 0x000FFFFFFFFFFFFFULL); + ResultElements.push_back( + APValue(APSInt(ResElt.lshr(52).trunc(64) & Mask, false))); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: case clang::X86::BI__builtin_ia32_vprotqi: diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index f01b322ce7787..6d800f25e5798 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -19,52 +19,55 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di)__X, (__v8di)__Y, + (__v8di)__Z); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), - (__v8di)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, + __m512i __Z) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di)__X, (__v8di)__Y, + (__v8di)__Z); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), - (__v8di)__W); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, + __m512i __Z) { + return (__m512i)__builtin_ia32_selectq_512( + __M, (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index 5c782d2a5b865..1a9aaaf53affa 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -22,6 +22,14 @@ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + // must vex-encoding /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y @@ -55,7 +63,7 @@ /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, (__v2di)__Z); @@ -92,7 +100,7 @@ _mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, (__v4di)__Z); @@ -129,7 +137,7 @@ _mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { /// ENDFOR /// dst[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, (__v2di)__Z); @@ -166,12 +174,14 @@ _mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) { /// ENDFOR /// dst[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, (__v4di)__Z); } #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR #endif // __AVXIFMAINTRIN_H diff --git a/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp b/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp new file mode 100644 index 0000000000000..72dde7d27b3ec --- /dev/null +++ b/clang/test/AST/ByteCode/x86-ifma-constexpr.cpp @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++2a -fsyntax-only \ +// RUN: -triple x86_64-unknown-unknown -target-feature +avxifma -ffreestanding \ +// RUN: -verify %s + +// Test constexpr evaluation of X86 IFMA intrinsics with the ByteCode interpreter. + +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); + +// Declare required IFMA builtin functions. +extern "C" { +__m128i __builtin_ia32_vpmadd52luq128(__m128i, __m128i, __m128i); +__m128i __builtin_ia32_vpmadd52huq128(__m128i, __m128i, __m128i); +} + +// Intrinsic wrapper functions. +static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return __builtin_ia32_vpmadd52luq128(__X, __Y, __Z); +} + +// Simple test to check if IFMA intrinsics can be used in constexpr context +constexpr bool test_basic_ifma() { + __m128i a = (__m128i){5ULL, 3ULL}; + __m128i b = (__m128i){7ULL, 4ULL}; + __m128i c = (__m128i){2ULL, 1ULL}; + + // Just test that we can call the intrinsic in constexpr context + __m128i result = _mm_madd52lo_epu64(a, b, c); + (void)result; // Suppress unused variable warning + return true; +} + +// Basic test to verify constexpr evaluation works +static_assert(test_basic_ifma(), "Basic IFMA constexpr test failed"); + +// expected-no-diagnostics \ No newline at end of file diff --git a/clang/test/Sema/x86-ifma-constexpr.cpp b/clang/test/Sema/x86-ifma-constexpr.cpp new file mode 100644 index 0000000000000..eb1fca13dfea5 --- /dev/null +++ b/clang/test/Sema/x86-ifma-constexpr.cpp @@ -0,0 +1,262 @@ +// RUN: %clang_cc1 -std=c++2a -fsyntax-only -triple x86_64-unknown-unknown \ +// RUN: -target-feature +avxifma -ffreestanding -verify %s + +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); +typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); + +typedef unsigned long long __v2du __attribute__((__vector_size__(16))); +typedef unsigned long long __v4du __attribute__((__vector_size__(32))); +typedef unsigned long long __v8du __attribute__((__vector_size__(64))); + +extern "C" { +__m128i __builtin_ia32_vpmadd52luq128(__m128i, __m128i, __m128i); +__m128i __builtin_ia32_vpmadd52huq128(__m128i, __m128i, __m128i); +__m256i __builtin_ia32_vpmadd52luq256(__m256i, __m256i, __m256i); +__m256i __builtin_ia32_vpmadd52huq256(__m256i, __m256i, __m256i); +__m512i __builtin_ia32_vpmadd52luq512(__m512i, __m512i, __m512i); +__m512i __builtin_ia32_vpmadd52huq512(__m512i, __m512i, __m512i); +} +static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return __builtin_ia32_vpmadd52luq128(__X, __Y, __Z); +} + +static constexpr __inline__ __m128i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return __builtin_ia32_vpmadd52huq128(__X, __Y, __Z); +} + +static constexpr __inline__ __m256i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return __builtin_ia32_vpmadd52luq256(__X, __Y, __Z); +} + +static constexpr __inline__ __m256i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return __builtin_ia32_vpmadd52huq256(__X, __Y, __Z); +} + +static constexpr __inline__ __m512i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return __builtin_ia32_vpmadd52luq512(__X, __Y, __Z); +} + +static constexpr __inline__ __m512i __attribute__((__always_inline__, __nodebug__, __target__("avxifma"))) +_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) { + return __builtin_ia32_vpmadd52huq512(__X, __Y, __Z); +} + +#define TEST_CONSTEXPR(expr) static_assert(expr, "constexpr test failed") + +constexpr bool match_v2du(__m128i result, unsigned long long e0, unsigned long long e1) { + __v2du v = (__v2du)result; + return v[0] == e0 && v[1] == e1; +} + +constexpr bool match_v4du(__m256i result, unsigned long long e0, unsigned long long e1, + unsigned long long e2, unsigned long long e3) { + __v4du v = (__v4du)result; + return v[0] == e0 && v[1] == e1 && v[2] == e2 && v[3] == e3; +} + +constexpr bool match_v8du(__m512i result, unsigned long long e0, unsigned long long e1, + unsigned long long e2, unsigned long long e3, + unsigned long long e4, unsigned long long e5, + unsigned long long e6, unsigned long long e7) { + __v8du v = (__v8du)result; + return v[0] == e0 && v[1] == e1 && v[2] == e2 && v[3] == e3 && + v[4] == e4 && v[5] == e5 && v[6] == e6 && v[7] == e7; +} + +constexpr unsigned long long compute_madd52lo_manual( + unsigned long long a, unsigned long long b, unsigned long long c) { + constexpr unsigned long long mask52 = 0x000FFFFFFFFFFFFFULL; + return (a * b + c) & mask52; +} + +constexpr unsigned long long compute_madd52hi_manual( + unsigned long long a, unsigned long long b, unsigned long long c) { + constexpr unsigned long long mask52 = 0x000FFFFFFFFFFFFFULL; + return ((a * b + c) >> 52) & mask52; +} + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){5, 3}, + (__m128i)(__v2du){7, 4}, + (__m128i)(__v2du){2, 1}), + 37, 13)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){5, 3, 2, 6}, + (__m256i)(__v4du){7, 4, 8, 3}, + (__m256i)(__v4du){2, 1, 5, 4}), + 37, 13, 21, 22)); + +TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64( + (__m512i)(__v8du){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8du){8, 7, 6, 5, 4, 3, 2, 1}, + (__m512i)(__v8du){1, 1, 1, 1, 1, 1, 1, 1}), + 9, 15, 19, 21, 21, 19, 15, 9)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){0, 0}, + (__m128i)(__v2du){0, 0}, + (__m128i)(__v2du){0, 0}), + 0, 0)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){0, 0, 0, 0}, + (__m256i)(__v4du){0, 0, 0, 0}, + (__m256i)(__v4du){0, 0, 0, 0}), + 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64( + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}), + 0, 0, 0, 0, 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){0, 0}, + (__m128i)(__v2du){123, 456}, + (__m128i)(__v2du){42, 73}), + 42, 73)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){0, 0, 0, 0}, + (__m256i)(__v4du){123, 456, 789, 321}, + (__m256i)(__v4du){42, 73, 11, 99}), + 42, 73, 11, 99)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){5, 3}, + (__m128i)(__v2du){7, 4}, + (__m128i)(__v2du){0, 0}), + 35, 12)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){5, 3, 7, 2}, + (__m256i)(__v4du){7, 4, 3, 8}, + (__m256i)(__v4du){0, 0, 0, 0}), + 35, 12, 21, 16)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64( + (__m128i)(__v2du){0x0010000000000000ULL, 0x0008000000000000ULL}, + (__m128i)(__v2du){1, 2}, + (__m128i)(__v2du){0, 0}), + 1, 1)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64( + (__m256i)(__v4du){0x0010000000000000ULL, 0x0008000000000000ULL, 0x0020000000000000ULL, 0x0004000000000000ULL}, + (__m256i)(__v4du){1, 2, 1, 4}, + (__m256i)(__v4du){0, 0, 0, 0}), + 1, 1, 2, 1)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){0x0010000000000000ULL, 0x0008000000000000ULL}, + (__m128i)(__v2du){1, 2}, + (__m128i)(__v2du){0, 0}), + 0, 0)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){0x0010000000000000ULL, 0x0008000000000000ULL, 0x0020000000000000ULL, 0x0004000000000000ULL}, + (__m256i)(__v4du){1, 2, 1, 4}, + (__m256i)(__v4du){0, 0, 0, 0}), + 0, 0, 0, 0)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}, + (__m128i)(__v2du){1, 2}, + (__m128i)(__v2du){0, 1}), + compute_madd52lo_manual(0x000FFFFFFFFFFFFFULL, 1, 0), + compute_madd52lo_manual(0x000FFFFFFFFFFFFFULL, 2, 1))); + +TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64( + (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}, + (__m128i)(__v2du){1, 2}, + (__m128i)(__v2du){0, 1}), + compute_madd52hi_manual(0x000FFFFFFFFFFFFFULL, 1, 0), + compute_madd52hi_manual(0x000FFFFFFFFFFFFFULL, 2, 1))); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){123456789, 987654321}, + (__m128i)(__v2du){1, 1}, + (__m128i)(__v2du){0, 0}), + 123456789, 987654321)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){123456789, 987654321, 555666777, 111222333}, + (__m256i)(__v4du){1, 1, 1, 1}, + (__m256i)(__v4du){0, 0, 0, 0}), + 123456789, 987654321, 555666777, 111222333)); + +TEST_CONSTEXPR(match_v8du(_mm512_madd52lo_epu64( + (__m512i)(__v8du){123456789, 987654321, 555666777, 111222333, 444555666, 777888999, 100200300, 999888777}, + (__m512i)(__v8du){1, 1, 1, 1, 1, 1, 1, 1}, + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}), + 123456789, 987654321, 555666777, 111222333, 444555666, 777888999, 100200300, 999888777)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){123, 456}, + (__m128i)(__v2du){789, 321}, + (__m128i)(__v2du){100, 200}), + compute_madd52lo_manual(123, 789, 100), compute_madd52lo_manual(456, 321, 200))); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){123, 456, 789, 321}, + (__m256i)(__v4du){789, 321, 123, 456}, + (__m256i)(__v4du){100, 200, 300, 400}), + compute_madd52lo_manual(123, 789, 100), compute_madd52lo_manual(456, 321, 200), + compute_madd52lo_manual(789, 123, 300), compute_madd52lo_manual(321, 456, 400))); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){789, 321}, + (__m128i)(__v2du){123, 456}, + (__m128i)(__v2du){100, 200}), + compute_madd52lo_manual(789, 123, 100), compute_madd52lo_manual(321, 456, 200))); + +TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64( + (__m128i)(__v2du){0x0010000000000000ULL, 0x0020000000000000ULL}, + (__m128i)(__v2du){2, 3}, + (__m128i)(__v2du){0, 0}), + 2, 6)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64( + (__m256i)(__v4du){0x0010000000000000ULL, 0x0020000000000000ULL, 0x0030000000000000ULL, 0x0040000000000000ULL}, + (__m256i)(__v4du){2, 3, 2, 2}, + (__m256i)(__v4du){0, 0, 0, 0}), + 2, 6, 6, 8)); + +TEST_CONSTEXPR(match_v8du(_mm512_madd52hi_epu64( + (__m512i)(__v8du){0x0010000000000000ULL, 0x0020000000000000ULL, 0x0030000000000000ULL, 0x0040000000000000ULL, + 0x0050000000000000ULL, 0x0060000000000000ULL, 0x0070000000000000ULL, 0x0080000000000000ULL}, + (__m512i)(__v8du){2, 3, 2, 2, 1, 1, 1, 1}, + (__m512i)(__v8du){0, 0, 0, 0, 0, 0, 0, 0}), + 2, 6, 6, 8, 5, 6, 7, 8)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52hi_epu64( + (__m128i)(__v2du){1, 1}, + (__m128i)(__v2du){1, 1}, + (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}), + 1, 1)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52hi_epu64( + (__m256i)(__v4du){1, 1, 1, 1}, + (__m256i)(__v4du){1, 1, 1, 1}, + (__m256i)(__v4du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}), + 1, 1, 1, 1)); + +TEST_CONSTEXPR(match_v2du(_mm_madd52lo_epu64( + (__m128i)(__v2du){1, 1}, + (__m128i)(__v2du){1, 1}, + (__m128i)(__v2du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}), + 0, 0)); + +TEST_CONSTEXPR(match_v4du(_mm256_madd52lo_epu64( + (__m256i)(__v4du){1, 1, 1, 1}, + (__m256i)(__v4du){1, 1, 1, 1}, + (__m256i)(__v4du){0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL, 0x000FFFFFFFFFFFFFULL}), + 0, 0, 0, 0)); + +// expected-no-diagnostics \ No newline at end of file _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
