https://github.com/AdityaC4 updated https://github.com/llvm/llvm-project/pull/158778
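The two patches below mark the AVX/AVX512 subvector-insertion builtins as constant-evaluatable, in both the bytecode interpreter (InterpBuiltin.cpp) and the classic evaluator (ExprConstant.cpp). As a rough illustration of what this enables (not taken from the patch; it assumes an AVX-enabled compile, e.g. -mavx, and a Clang where the _mm256_set1_ps/_mm_set1_ps helpers and vector element subscripting already fold in constant expressions, which is what the TEST_CONSTEXPR checks in the new test rely on):

  #include <immintrin.h>

  // With __builtin_ia32_vinsertf128_ps256 marked Constexpr, the whole
  // initializer folds at compile time: lane 1 (elements 4..7) takes the
  // 128-bit source, lane 0 keeps the destination values.
  constexpr __m256 V =
      _mm256_insertf128_ps(_mm256_set1_ps(1.0f), _mm_set1_ps(7.0f), 1);
  static_assert(V[0] == 1.0f && V[3] == 1.0f, "lower lane unchanged");
  static_assert(V[4] == 7.0f && V[7] == 7.0f, "upper lane inserted");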
>From a2a0e8199769862804bc75e859cdc8f83838ef4d Mon Sep 17 00:00:00 2001
From: AdityaC4 <[email protected]>
Date: Mon, 15 Sep 2025 23:19:21 -0500
Subject: [PATCH 1/2] [Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow AVX/AVX512 subvector insertion intrinsics to be used in constexpr #157709

---
 clang/include/clang/Basic/BuiltinsX86.td      | 22 +++--
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      | 68 +++++++++++++++
 clang/lib/AST/ExprConstant.cpp                | 50 +++++++++++
 .../test/CodeGen/X86/avx-insert-constexpr.cpp | 87 +++++++++++++++++++
 4 files changed, 219 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CodeGen/X86/avx-insert-constexpr.cpp

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index dd7727a39f693..941b0a96a2e07 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -502,9 +502,6 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
 let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
   def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
   def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
-  def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
-  def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
-  def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
   def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
   def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
@@ -513,6 +510,12 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
   def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
 }
 
+let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
+  def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
+  def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
+}
+
 let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
   def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
   def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
@@ -609,6 +612,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
   def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
   def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
+}
+
+let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
 }
 
@@ -2945,29 +2951,29 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256
   def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
 }
 
-let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
   def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
   def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
   def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
   def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
 }
 
-let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
   def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
 }
 
-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
   def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
 }
 
-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
   def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
 }
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 40b9e04aa335c..edc02671dfda7 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2937,6 +2937,56 @@ static bool interp__builtin_elementwise_triop(
   return true;
 }
 
+static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
+                                                 const CallExpr *Call,
+                                                 unsigned ID) {
+  assert(Call->getNumArgs() == 3);
+
+  PrimType ImmPT = *S.getContext().classify(Call->getArg(2));
+  APSInt ImmAPS = popToAPSInt(S.Stk, ImmPT);
+  uint64_t Index = ImmAPS.getZExtValue();
+
+  const Pointer &SubVec = S.Stk.pop<Pointer>();
+  if (!SubVec.getFieldDesc()->isPrimitiveArray()) {
+    return Invalid(S, OpPC);
+  }
+
+  const Pointer &DstVec = S.Stk.pop<Pointer>();
+  if (!DstVec.getFieldDesc()->isPrimitiveArray()) {
+    return Invalid(S, OpPC);
+  }
+
+  const Pointer &Result = S.Stk.peek<Pointer>();
+
+  unsigned DstElements = DstVec.getNumElems();
+  unsigned SubElements = SubVec.getNumElems();
+
+  if (SubElements == 0 || DstElements == 0 || (DstElements % SubElements) != 0)
+    return Invalid(S, OpPC);
+
+  unsigned NumLanes = DstElements / SubElements;
+  unsigned Lane = static_cast<unsigned>(Index % NumLanes);
+
+  QualType ElemType = DstVec.getFieldDesc()->getElemQualType();
+  PrimType ElemPT = *S.getContext().classify(ElemType);
+
+  unsigned InsertPos = Lane * SubElements;
+
+  TYPE_SWITCH(ElemPT, {
+    for (unsigned i = 0; i < DstElements; ++i) {
+      Result.elem<T>(i) = DstVec.elem<T>(i);
+    }
+
+    for (unsigned i = 0; i < SubElements; ++i) {
+      Result.elem<T>(InsertPos + i) = SubVec.elem<T>(i);
+    }
+  });
+
+  Result.initializeAllElements();
+
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3595,6 +3645,24 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
     return interp__builtin_elementwise_triop(S, OpPC, Call,
                                              llvm::APIntOps::fshr);
 
+  case X86::BI__builtin_ia32_insertf32x4_256:
+  case X86::BI__builtin_ia32_inserti32x4_256:
+  case X86::BI__builtin_ia32_insertf64x2_256:
+  case X86::BI__builtin_ia32_inserti64x2_256:
+  case X86::BI__builtin_ia32_insertf32x4:
+  case X86::BI__builtin_ia32_inserti32x4:
+  case X86::BI__builtin_ia32_insertf64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_insertf32x8:
+  case X86::BI__builtin_ia32_inserti32x8:
+  case X86::BI__builtin_ia32_insertf64x4:
+  case X86::BI__builtin_ia32_inserti64x4:
+  case X86::BI__builtin_ia32_vinsertf128_ps256:
+  case X86::BI__builtin_ia32_vinsertf128_pd256:
+  case X86::BI__builtin_ia32_vinsertf128_si256:
+  case X86::BI__builtin_ia32_insert128i256:
+    return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);
+
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 77dc2203576b3..94fb6dfcfa75d 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12127,6 +12127,56 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case X86::BI__builtin_ia32_insertf32x4_256:
+  case X86::BI__builtin_ia32_inserti32x4_256:
+  case X86::BI__builtin_ia32_insertf64x2_256:
+  case X86::BI__builtin_ia32_inserti64x2_256:
+  case X86::BI__builtin_ia32_insertf32x4:
+  case X86::BI__builtin_ia32_inserti32x4:
+  case X86::BI__builtin_ia32_insertf64x2_512:
+  case X86::BI__builtin_ia32_inserti64x2_512:
+  case X86::BI__builtin_ia32_insertf32x8:
+  case X86::BI__builtin_ia32_inserti32x8:
+  case X86::BI__builtin_ia32_insertf64x4:
+  case X86::BI__builtin_ia32_inserti64x4:
+  case X86::BI__builtin_ia32_vinsertf128_ps256:
+  case X86::BI__builtin_ia32_vinsertf128_pd256:
+  case X86::BI__builtin_ia32_vinsertf128_si256:
+  case X86::BI__builtin_ia32_insert128i256: {
+    APValue SourceDst, SourceSub;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceDst) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceSub))
+      return false;
+
+    APSInt Imm;
+    if (!EvaluateInteger(E->getArg(2), Imm, Info))
+      return false;
+
+    if (!SourceDst.isVector() || !SourceSub.isVector())
+      return false;
+
+    unsigned DstLen = SourceDst.getVectorLength();
+    unsigned SubLen = SourceSub.getVectorLength();
+    if (SubLen == 0 || DstLen == 0 || (DstLen % SubLen) != 0)
+      return false;
+
+    unsigned NumLanes = DstLen / SubLen;
+    unsigned LaneIdx = (Imm.getZExtValue() % NumLanes) * SubLen;
+
+    SmallVector<APValue, 16> ResultElements;
+    ResultElements.reserve(DstLen);
+
+    for (unsigned EltNum = 0; EltNum < DstLen; ++EltNum) {
+      if (EltNum >= LaneIdx && EltNum < LaneIdx + SubLen) {
+        ResultElements.push_back(SourceSub.getVectorElt(EltNum - LaneIdx));
+      } else {
+        ResultElements.push_back(SourceDst.getVectorElt(EltNum));
+      }
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
   }
 }
diff --git a/clang/test/CodeGen/X86/avx-insert-constexpr.cpp b/clang/test/CodeGen/X86/avx-insert-constexpr.cpp
new file mode 100644
index 0000000000000..30c1776d8ba6d
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx-insert-constexpr.cpp
@@ -0,0 +1,87 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-linux-gnu -O0 -target-cpu skylake-avx512 -std=c++17 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-linux-gnu -O0 -target-cpu skylake-avx512 -std=c++17 -fexperimental-new-constant-interpreter -emit-llvm -o - %s | FileCheck %s
+
+#include <immintrin.h>
+#include "builtin_test_helpers.h"
+
+//
+// AVX256 Insert Tests
+//
+
+__m256 test_mm256_insertf32x4(__m256 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_insertf32x4
+  return _mm256_insertf32x4(A, B, 1);
+}
+
+// Insert 128-bit float vector into upper lane
+TEST_CONSTEXPR(match_m256(_mm256_insertf32x4(_mm256_set1_ps(1.0f), _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f), 1), 1.0f, 1.0f, 1.0f, 1.0f, 10.0f, 20.0f, 30.0f, 40.0f));
+
+__m256i test_mm256_inserti32x4(__m256i A, __m128i B) {
+  // CHECK-LABEL: test_mm256_inserti32x4
+  return _mm256_inserti32x4(A, B, 0);
+}
+
+// Insert 128-bit integer vector into lower lane
+TEST_CONSTEXPR(match_v8si(_mm256_inserti32x4(_mm256_set1_epi32(1), _mm_set_epi32(40, 30, 20, 10), 0), 10, 20, 30, 40, 1, 1, 1, 1));
+
+//
+// AVX256 Masked Insert Test
+//
+
+__m256 test_mm256_maskz_insertf32x4(__mmask8 U, __m256 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_maskz_insertf32x4
+  return _mm256_maskz_insertf32x4(U, A, B, 1);
+}
+
+// Test zero mask produces all zeros
+TEST_CONSTEXPR(match_m256(
+    _mm256_maskz_insertf32x4(0x00, _mm256_set1_ps(1.0f),
+                             _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f), 1),
+    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f));
+
+//
+// AVX Legacy Insert Test
+//
+
+__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_insertf128_ps
+  return _mm256_insertf128_ps(A, B, 1);
+}
+
+// Legacy insertf128 into upper lane
+TEST_CONSTEXPR(match_m256(_mm256_insertf128_ps(_mm256_set1_ps(1.0f), _mm_set1_ps(7.0f), 1), 1.0f, 1.0f, 1.0f, 1.0f, 7.0f, 7.0f, 7.0f, 7.0f));
+
+//
+//AVX512 Insert Tests
+//
+
+__m512 test_mm512_insertf32x4(__m512 A, __m128 B) {
+  // CHECK-LABEL: test_mm512_insertf32x4
+  return _mm512_insertf32x4(A, B, 3);
+}
+
+// Insert 128-bit into highest lane of 512-bit vector
+TEST_CONSTEXPR(match_m512(_mm512_insertf32x4(_mm512_set1_ps(1.0f), _mm_set1_ps(5.0f), 3), 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 5.0f, 5.0f, 5.0f, 5.0f));
+
+__m512 test_mm512_insertf32x8(__m512 A, __m256 B) {
+  // CHECK-LABEL: test_mm512_insertf32x8
+  return _mm512_insertf32x8(A, B, 1);
+}
+
+// Insert 256-bit into upper half of 512-bit vector
+TEST_CONSTEXPR(match_m512(_mm512_insertf32x8(_mm512_set1_ps(1.0f), _mm256_set1_ps(2.0f), 1), 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f));
+
+//
+// AVX512 Masked Insert Test
+//
+
+__m512 test_mm512_maskz_insertf32x4(__mmask16 U, __m512 A, __m128 B) {
+  // CHECK-LABEL: test_mm512_maskz_insertf32x4
+  return _mm512_maskz_insertf32x4(U, A, B, 3);
+}
+
+// Test zero mask produces all zeros
+TEST_CONSTEXPR(match_m512(
+    _mm512_maskz_insertf32x4(0x0000, _mm512_set1_ps(1.0f), _mm_set1_ps(5.0f), 3),
+    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f));

>From 8472b631c0905b7582b6db7c82643604fd3c5713 Mon Sep 17 00:00:00 2001
From: AdityaC4 <[email protected]>
Date: Tue, 16 Sep 2025 09:29:18 -0500
Subject: [PATCH 2/2] use getPrimType() instead & formatting

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 28 ++++++++++--------------
 clang/lib/AST/ExprConstant.cpp           |  5 ++---
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index edc02671dfda7..dbdd3860daf68 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2947,14 +2947,12 @@ static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
   uint64_t Index = ImmAPS.getZExtValue();
 
   const Pointer &SubVec = S.Stk.pop<Pointer>();
-  if (!SubVec.getFieldDesc()->isPrimitiveArray()) {
-    return Invalid(S, OpPC);
-  }
+  if (!SubVec.getFieldDesc()->isPrimitiveArray())
+    return false;
 
   const Pointer &DstVec = S.Stk.pop<Pointer>();
-  if (!DstVec.getFieldDesc()->isPrimitiveArray()) {
-    return Invalid(S, OpPC);
-  }
+  if (!DstVec.getFieldDesc()->isPrimitiveArray())
+    return false;
 
   const Pointer &Result = S.Stk.peek<Pointer>();
 
@@ -2962,24 +2960,20 @@ static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
   unsigned SubElements = SubVec.getNumElems();
 
   if (SubElements == 0 || DstElements == 0 || (DstElements % SubElements) != 0)
-    return Invalid(S, OpPC);
+    return false;
 
   unsigned NumLanes = DstElements / SubElements;
   unsigned Lane = static_cast<unsigned>(Index % NumLanes);
-
-  QualType ElemType = DstVec.getFieldDesc()->getElemQualType();
-  PrimType ElemPT = *S.getContext().classify(ElemType);
-
   unsigned InsertPos = Lane * SubElements;
 
+  PrimType ElemPT = DstVec.getFieldDesc()->getPrimType();
+
   TYPE_SWITCH(ElemPT, {
-    for (unsigned i = 0; i < DstElements; ++i) {
-      Result.elem<T>(i) = DstVec.elem<T>(i);
-    }
+    for (unsigned I = 0; I != DstElements; ++I)
+      Result.elem<T>(I) = DstVec.elem<T>(I);
 
-    for (unsigned i = 0; i < SubElements; ++i) {
-      Result.elem<T>(InsertPos + i) = SubVec.elem<T>(i);
-    }
+    for (unsigned I = 0; I != SubElements; ++I)
+      Result.elem<T>(InsertPos + I) = SubVec.elem<T>(I);
   });
 
   Result.initializeAllElements();
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 94fb6dfcfa75d..6ebaab42dd22f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12168,11 +12168,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     ResultElements.reserve(DstLen);
 
     for (unsigned EltNum = 0; EltNum < DstLen; ++EltNum) {
-      if (EltNum >= LaneIdx && EltNum < LaneIdx + SubLen) {
+      if (EltNum >= LaneIdx && EltNum < LaneIdx + SubLen)
         ResultElements.push_back(SourceSub.getVectorElt(EltNum - LaneIdx));
-      } else {
+      else
         ResultElements.push_back(SourceDst.getVectorElt(EltNum));
-      }
     }
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
