llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Ahmed Nour (ahmednoursphinx) <details> <summary>Changes</summary> This PR adds constexpr support for the AVX2 cross-lane permute intrinsics _mm256_permute4x64_pd and _mm256_permute4x64_epi64 Resolves https://github.com/llvm/llvm-project/issues/169304 --- Full diff: https://github.com/llvm/llvm-project/pull/170442.diff 5 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsX86.td (+6-2) - (modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+10) - (modified) clang/lib/AST/ExprConstant.cpp (+13) - (modified) clang/lib/Headers/avx2intrin.h (+2-2) - (modified) clang/test/CodeGen/X86/avx2-builtins.c (+22) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 98cea35beb0ea..23eee6df926a1 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -577,11 +577,15 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psadbw256 : X86Builtin< "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; - def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; } +let Features = "avx2", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def permdf256 + : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">; + def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; +} let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 
971fce541bb88..3ff5dc3eb5600 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4944,6 +4944,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::make_pair(0, static_cast<int>(LaneOffset + Index)); }); + case X86::BI__builtin_ia32_permdf256: + case X86::BI__builtin_ia32_permdi256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned Control) { + // permute4x64 operates on 4 64-bit elements + // For element i (0-3), extract bits [2*i+1:2*i] from Control + unsigned Index = (Control >> (2 * DstIdx)) & 0x3; + return std::make_pair(0, static_cast<int>(Index)); + }); + case X86::BI__builtin_ia32_vpmultishiftqb128: case X86::BI__builtin_ia32_vpmultishiftqb256: case X86::BI__builtin_ia32_vpmultishiftqb512: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index e5af4cb049ba9..13f27be6df58f 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -13122,6 +13122,19 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(R, E); } + case X86::BI__builtin_ia32_permdf256: + case X86::BI__builtin_ia32_permdi256: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, [](unsigned DstIdx, unsigned Control) { + // permute4x64 operates on 4 64-bit elements + // For element i (0-3), extract bits [2*i+1:2*i] from Control + unsigned Index = (Control >> (2 * DstIdx)) & 0x3; + return std::make_pair(0, static_cast<int>(Index)); + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermilvarps: case X86::BI__builtin_ia32_vpermilvarps256: case X86::BI__builtin_ia32_vpermilvarps512: { diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index d3ceb2327ac62..4c73a4a59e326 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -3238,7 +3238,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) { /// \a 
M[1:0] specifies the index in \a a for element 0 of the result, /// \a M[3:2] specifies the index for element 1, and so forth. /// \returns A 256-bit vector of [4 x double] containing the result. -#define _mm256_permute4x64_pd(V, M) \ +#define _mm256_permute4x64_pd(V, M) \ ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) /// Sets the result's 256-bit vector of [8 x float] to copies of elements of @@ -3295,7 +3295,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) { /// \a M[1:0] specifies the index in \a a for element 0 of the result, /// \a M[3:2] specifies the index for element 1, and so forth. /// \returns A 256-bit vector of [4 x i64] containing the result. -#define _mm256_permute4x64_epi64(V, M) \ +#define _mm256_permute4x64_epi64(V, M) \ ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) /// Sets each half of the 256-bit result either to zero or to one of the diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index d6facfea8962e..1f7b2fe7e2d39 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1111,12 +1111,34 @@ __m256i test_mm256_permute4x64_epi64(__m256i a) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <4 x i32> <i32 3, i32 0, i32 2, i32 0> return _mm256_permute4x64_epi64(a, 35); } +// Control value 0x00: [0,0,0,0] -> broadcast element 0 +TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x00), 10LL, 10LL, 10LL, 10LL)); +// Control value 0x1B: [0,1,2,3] -> reverse order [3,2,1,0] = [D,C,B,A] +TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x1B), 40LL, 30LL, 20LL, 10LL)); +// Control value 0x39: [1,2,3,0] -> rotate left [B,C,D,A] +TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x39), 20LL, 30LL, 40LL, 10LL)); +// Control value 0x12: [2,0,1,0] -> [C,A,B,A] 
+TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0x12), 30LL, 10LL, 20LL, 10LL)); +// Control value 0xE4: [3,2,1,0] -> identity [A,B,C,D] +TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(40LL, 30LL, 20LL, 10LL), 0xE4), 10LL, 20LL, 30LL, 40LL)); +// Test with negative values +TEST_CONSTEXPR(match_v4di(_mm256_permute4x64_epi64(_mm256_set_epi64x(-40LL, -30LL, -20LL, -10LL), 0x1B), -40LL, -30LL, -20LL, -10LL)); __m256d test_mm256_permute4x64_pd(__m256d a) { // CHECK-LABEL: test_mm256_permute4x64_pd // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 1, i32 0> return _mm256_permute4x64_pd(a, 25); } +// Control value 0x00: [0,0,0,0] -> broadcast element 0 +TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x00), 1.0, 1.0, 1.0, 1.0)); +// Control value 0x1B: [0,1,2,3] -> reverse order [3,2,1,0] = [D,C,B,A] +TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x1B), 4.0, 3.0, 2.0, 1.0)); +// Control value 0x39: [1,2,3,0] -> rotate left [B,C,D,A] +TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x39), 2.0, 3.0, 4.0, 1.0)); +// Control value 0x12: [2,0,1,0] -> [C,A,B,A] +TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0x12), 3.0, 1.0, 2.0, 1.0)); +// Control value 0xE4: [3,2,1,0] -> identity [A,B,C,D] +TEST_CONSTEXPR(match_m256d(_mm256_permute4x64_pd(_mm256_set_pd(4.0, 3.0, 2.0, 1.0), 0xE4), 1.0, 2.0, 3.0, 4.0)); __m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_permutevar8x32_epi32 `````````` </details> https://github.com/llvm/llvm-project/pull/170442 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
