https://github.com/adream307 updated https://github.com/llvm/llvm-project/pull/202257
>From 52eb55fe0e5715ff5e1ed92f3621612d14667834 Mon Sep 17 00:00:00 2001 From: adream307 <[email protected]> Date: Thu, 4 Jun 2026 11:29:09 +0800 Subject: [PATCH] [clang][X86] Add constexpr support for mpsadbw128/256 intrinsics Enable constexpr evaluation for `_mm_mpsadbw_epu8` and `_mm256_mpsadbw_epu8` (`__builtin_ia32_mpsadbw128`/`mpsadbw256`). Fixes #157522. Signed-off-by: adream307 <[email protected]> --- clang/include/clang/Basic/BuiltinsX86.td | 7 ++-- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 52 ++++++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 40 ++++++++++++++++++ clang/test/CodeGen/X86/avx2-builtins.c | 17 ++++++++ clang/test/CodeGen/X86/sse41-builtins.c | 8 ++++ 5 files changed, 121 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index c8c371625b568..3fedae5ac289a 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -315,10 +315,11 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] def roundpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Constant int)">; def dpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; def dppd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2,double>, _Constant char)">; - def mpsadbw128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">; } let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def mpsadbw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>, _Constant char)">; + def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">; @@ -574,14 +575,14 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid } let Features = 
"avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; - def psadbw256 : X86Builtin< "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def mpsadbw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; + def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long " diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 5f8681ef38492..5a7eebf935000 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3019,6 +3019,54 @@ static bool interp__builtin_ia32_dbpsadbw(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_mpsadbw(InterpState &S, CodePtr OpPC, + const CallExpr *Call) { + assert(Call->getNumArgs() == 3); + uint64_t Imm; + if (!popToUInt64(S, Call->getArg(2), Imm)) + return false; + + const Pointer &Src2 = S.Stk.pop<Pointer>(); + const Pointer &Src1 = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + const auto *SrcVT = Call->getArg(0)->getType()->castAs<VectorType>(); + PrimType SrcElemT = *S.getContext().classify(SrcVT->getElementType()); + unsigned SourceLen = SrcVT->getNumElements(); // 16 or 32 + + const auto *DestVT = Call->getType()->castAs<VectorType>(); + PrimType DestElemT = *S.getContext().classify(DestVT->getElementType()); + bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + + constexpr unsigned LaneSize = 16; // 128-bit lane = 16 bytes + unsigned NumLanes = SourceLen / LaneSize; + + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned Ctrl = (Imm >> (3 * Lane)) & 0x7; + unsigned AOff = ((Ctrl >> 2) & 
1) * 4; + unsigned BOff = (Ctrl & 3) * 4; + for (unsigned J = 0; J < 8; ++J) { + uint16_t Sad = 0; + for (unsigned K = 0; K < 4; ++K) { + uint8_t A, B; + INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { + A = static_cast<uint8_t>( + Src1.elem<T>(Lane * LaneSize + AOff + J + K)); + B = static_cast<uint8_t>(Src2.elem<T>(Lane * LaneSize + BOff + K)); + }); + Sad += (A > B) ? (A - B) : (B - A); + } + INT_TYPE_SWITCH_NO_BOOL(DestElemT, { + Dst.elem<T>(Lane * 8 + J) = + static_cast<T>(APSInt(APInt(16, Sad), DestUnsigned)); + }); + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp_builtin_horizontal_int_binop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) { @@ -5315,6 +5363,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_dbpsadbw512: return interp__builtin_ia32_dbpsadbw(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_mpsadbw128: + case clang::X86::BI__builtin_ia32_mpsadbw256: + return interp__builtin_ia32_mpsadbw(S, OpPC, Call); + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 1642c41a99a2f..c3cd9c3e95886 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12694,6 +12694,46 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case clang::X86::BI__builtin_ia32_mpsadbw128: + case clang::X86::BI__builtin_ia32_mpsadbw256: { + APValue SourceA, SourceB, SourceImm; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) || + !EvaluateAsRValue(Info, E->getArg(1), SourceB) || + !EvaluateAsRValue(Info, E->getArg(2), SourceImm)) + return false; + unsigned SourceLen = SourceA.getVectorLength(); + constexpr unsigned LaneSize = 16; + unsigned 
NumLanes = SourceLen / LaneSize; + unsigned Imm = SourceImm.getInt().getZExtValue(); + + QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType(); + bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType(); + SmallVector<APValue, 16> ResultElements; + ResultElements.reserve(SourceLen / 2); + + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + unsigned Ctrl = (Imm >> (3 * Lane)) & 0x7; + unsigned AOff = ((Ctrl >> 2) & 1) * 4; + unsigned BOff = (Ctrl & 3) * 4; + for (unsigned J = 0; J < 8; ++J) { + uint16_t Sad = 0; + for (unsigned K = 0; K < 4; ++K) { + uint8_t A = static_cast<uint8_t>( + SourceA.getVectorElt(Lane * 16 + AOff + J + K) + .getInt() + .getZExtValue()); + uint8_t B = + static_cast<uint8_t>(SourceB.getVectorElt(Lane * 16 + BOff + K) + .getInt() + .getZExtValue()); + Sad += (A > B) ? (A - B) : (B - A); + } + ResultElements.push_back(APValue(APSInt(APInt(16, Sad), DestUnsigned))); + } + } + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case clang::X86::BI__builtin_ia32_pmulhuw128: case clang::X86::BI__builtin_ia32_pmulhuw256: case clang::X86::BI__builtin_ia32_pmulhuw512: diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index de3d92ea1c6cc..b829d16c3566f 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1009,6 +1009,23 @@ __m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) { // CHECK: call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, i8 3) return _mm256_mpsadbw_epu8(x, y, 3); } +// imm=0 both lanes. 
Lane 0 first element: |2-83|+|3-89|+|5-97|+|7-101| = 353 ; lane 1 first element: |59-167|+|61-173|+|67-179|+|71-181| = 442 +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 0), 353,344,334,322,310,298,282,268,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 1), 415,406,396,384,372,360,344,330,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 2), 517,508,498,486,474,462,446,432,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 3), 603,594,584,572,560,548,532,518,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 4), 
310,298,282,268,250,232,218,202,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 5), 372,360,344,330,312,294,280,264,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 6), 474,462,446,432,414,396,382,366,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 7), 560,548,532,518,500,482,468,452,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 0<<3), 353,344,334,322,310,298,282,268,442,428,410,394,376,352,330,310)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 1<<3), 
353,344,334,322,310,298,282,268,522,508,490,474,456,432,410,390)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 2<<3), 353,344,334,322,310,298,282,268,632,618,600,584,566,542,520,500)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 3<<3), 353,344,334,322,310,298,282,268,706,692,674,658,640,616,594,574)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 4<<3), 353,344,334,322,310,298,282,268,376,352,330,310,292,280,268,244)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 5<<3), 353,344,334,322,310,298,282,268,456,432,410,390,372,360,348,324)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 6<<3), 
353,344,334,322,310,298,282,268,566,542,520,500,482,470,458,434)); +TEST_CONSTEXPR(match_v16hu(_mm256_mpsadbw_epu8(((__m256i)(__v32qu){2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71,73,79,83,89,97,101,103,107,109,113,127,131}), ((__m256i)(__v32qu){83,89,97,101,103,107,109,113,127,131,137,139,149,151,157,163,167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 7<<3), 353,344,334,322,310,298,282,268,640,616,594,574,556,544,532,508)); __m256i test_mm256_mul_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mul_epi32 diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 1be1aa71de737..dceb86b87ce3d 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -402,6 +402,14 @@ __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) { // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1) return _mm_mpsadbw_epu8(x, y, 1); } +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 0), 640,628,612,598,580,562,548,532)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 1), 720,708,692,678,660,642,628,612)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 2), 830,818,802,788,770,752,738,722)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 3), 904,892,876,862,844,826,812,796)); 
+TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 4), 580,562,548,532,516,498,480,460)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 5), 660,642,628,612,596,578,560,540)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 6), 770,752,738,722,706,688,670,650)); +TEST_CONSTEXPR(match_v8hu(_mm_mpsadbw_epu8(((__m128i)(__v16qu){11,13,17,19,23,29,31,37,41,43,47,53,59,61,67,71}), ((__m128i)(__v16qu){167,173,179,181,191,193,197,199,211,223,227,229,233,239,241,251}), 7), 844,826,812,796,780,762,744,724)); __m128i test_mm_mul_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mul_epi32 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
