Author: Gergo Stomfai Date: 2025-12-23T13:44:21Z New Revision: 8290134a7421c5a5ce3348b29f908887158b4b8a
URL: https://github.com/llvm/llvm-project/commit/8290134a7421c5a5ce3348b29f908887158b4b8a DIFF: https://github.com/llvm/llvm-project/commit/8290134a7421c5a5ce3348b29f908887158b4b8a.diff LOG: [X86] EmitX86BuiltinExpr - attempt to convert SSE41/AVX1 roundps/d/ss/sd builtins to regular rounding modes (#171227) Adding clauses to `CodeGenFunction::EmitX86BuiltinExpr` to convert SSE4.1/AVX1 builtins `roundps/pd/ss/sd` to regular rounding modes. We use: 1. `roundeven/floor/ceil/trunc` when not using MXCSR or _MM_FROUND_RAISE_EXC , and FP mode is not strict, 2. `experimental_constrained_roundeven/floor/ceil/trunc` when not using MXCSR or _MM_FROUND_RAISE_EXC , and FP mode is strict 3. `x86_sse41/avx_round_ps/pd/ss/sd` when using MXCSR or _MM_FROUND_RAISE_EXC . Closes #170273 Added: clang/test/CodeGen/X86/sse41-builtins-constrained.c Modified: clang/lib/CodeGen/TargetBuiltins/X86.cpp clang/test/CodeGen/X86/avx-builtins-constrained.c clang/test/CodeGen/X86/avx-builtins.c clang/test/CodeGen/X86/sse41-builtins.c llvm/include/llvm/IR/IntrinsicsX86.td Removed: ################################################################################ diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index be2b7d442645e..685040fc4524f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -75,6 +75,62 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask, return MaskVec; } +/// Emit rounding for the value \p X according to the rounding \p +/// RoundingControl based on bits 0 and 1. 
+static Value *emitX86RoundImmediate(CodeGenFunction &CGF, Value *X, + unsigned RoundingControl) { + unsigned RoundingMask = 0b11; + unsigned RoundingMode = RoundingControl & RoundingMask; + + Intrinsic::ID ID = Intrinsic::not_intrinsic; + LLVMContext &Ctx = CGF.CGM.getLLVMContext(); + if (CGF.Builder.getIsFPConstrained()) { + + Value *ExceptMode = + MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore")); + + switch (RoundingMode) { + case 0b00: + ID = Intrinsic::experimental_constrained_roundeven; + break; + case 0b01: + ID = Intrinsic::experimental_constrained_floor; + break; + case 0b10: + ID = Intrinsic::experimental_constrained_ceil; + break; + case 0b11: + ID = Intrinsic::experimental_constrained_trunc; + break; + default: + llvm_unreachable("Invalid rounding mode"); + } + + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X, ExceptMode}); + } + + switch (RoundingMode) { + case 0b00: + ID = Intrinsic::roundeven; + break; + case 0b01: + ID = Intrinsic::floor; + break; + case 0b10: + ID = Intrinsic::ceil; + break; + case 0b11: + ID = Intrinsic::trunc; + break; + default: + llvm_unreachable("Invalid rounding mode"); + } + + Function *F = CGF.CGM.getIntrinsic(ID, X->getType()); + return CGF.Builder.CreateCall(F, {X}); +} + static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops, Align Alignment) { Value *Ptr = Ops[0]; @@ -840,6 +896,76 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Ops[0]); return Builder.CreateExtractValue(Call, 0); } + case X86::BI__builtin_ia32_roundps: + case X86::BI__builtin_ia32_roundpd: + case X86::BI__builtin_ia32_roundps256: + case X86::BI__builtin_ia32_roundpd256: { + unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue(); + unsigned MXCSRMask = 0b100; + unsigned FRoundNoExcMask = 0b1000; + unsigned UseMXCSR = MXCSRMask & M; + unsigned FRoundNoExc = FRoundNoExcMask & M; + + if (UseMXCSR || !FRoundNoExc) { + + Intrinsic::ID ID = 
Intrinsic::not_intrinsic; + + switch (BuiltinID) { + case X86::BI__builtin_ia32_roundps: + ID = Intrinsic::x86_sse41_round_ps; + break; + case X86::BI__builtin_ia32_roundps256: + ID = Intrinsic::x86_avx_round_ps_256; + break; + case X86::BI__builtin_ia32_roundpd: + ID = Intrinsic::x86_sse41_round_pd; + break; + case X86::BI__builtin_ia32_roundpd256: + ID = Intrinsic::x86_avx_round_pd_256; + break; + default: + llvm_unreachable("must return from switch"); + } + + Function *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Ops); + } + + return emitX86RoundImmediate(*this, Ops[0], M); + } + case X86::BI__builtin_ia32_roundss: + case X86::BI__builtin_ia32_roundsd: { + unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue(); + unsigned MXCSRMask = 0b100; + unsigned FRoundNoExcMask = 0b1000; + unsigned UseMXCSR = MXCSRMask & M; + unsigned FRoundNoExc = FRoundNoExcMask & M; + + if (UseMXCSR || !FRoundNoExc) { + + Intrinsic::ID ID = Intrinsic::not_intrinsic; + + switch (BuiltinID) { + case X86::BI__builtin_ia32_roundss: + ID = Intrinsic::x86_sse41_round_ss; + break; + case X86::BI__builtin_ia32_roundsd: + ID = Intrinsic::x86_sse41_round_sd; + break; + default: + llvm_unreachable("must return from switch"); + } + + Function *F = CGM.getIntrinsic(ID); + return Builder.CreateCall(F, Ops); + } + + Value *Idx = Builder.getInt32(0); + Value *ValAt0 = Builder.CreateExtractElement(Ops[1], Idx); + Value *RoundedAt0 = emitX86RoundImmediate(*this, ValAt0, M); + + return Builder.CreateInsertElement(Ops[0], RoundedAt0, Idx); + } case X86::BI__builtin_ia32_lzcnt_u16: case X86::BI__builtin_ia32_lzcnt_u32: case X86::BI__builtin_ia32_lzcnt_u64: { diff --git a/clang/test/CodeGen/X86/avx-builtins-constrained.c b/clang/test/CodeGen/X86/avx-builtins-constrained.c index 428febeb1d293..357b6e1c66339 100644 --- a/clang/test/CodeGen/X86/avx-builtins-constrained.c +++ b/clang/test/CodeGen/X86/avx-builtins-constrained.c @@ -32,4 +32,40 @@ __m256d test_mm256_sqrt_pd(__m256d x) { // CONSTRAINED: 
call {{.*}}<4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double> {{.*}}, metadata !{{.*}}) // CHECK-ASM: vsqrtpd %ymm{{.*}}, return _mm256_sqrt_pd(x); -} \ No newline at end of file +} + +__m256d test_mm256_round_pd_mxcsr(__m256d x) { + // CONSTRAINED-LABEL: test_mm256_round_pd_mxcsr + // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 12) + return _mm256_round_pd(x, 0b1100); +} + +__m256d test_mm256_round_pd_fround_no_exc(__m256d x) { + // CONSTRAINED-LABEL: test_mm256_round_pd_fround_no_exc + // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 0) + return _mm256_round_pd(x, 0b0000); +} + +__m256d test_mm256_round_pd_trunc(__m256d x) { + // CONSTRAINED-LABEL: test_mm256_round_pd_trunc + // CONSTRAINED: %{{.*}} = call <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.ignore") + return _mm256_round_pd(x, 0b1011); +} + +__m256 test_mm256_round_ps_mxcsr(__m256 x) { + // CONSTRAINED-LABEL: test_mm256_round_ps_mxcsr + // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 12) + return _mm256_round_ps(x, 0b1100); +} + +__m256 test_mm256_round_ps_fround_no_exc(__m256 x) { + // CONSTRAINED-LABEL: test_mm256_round_ps_fround_no_exc + // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 0) + return _mm256_round_ps(x, 0b0000); +} + +__m256 test_mm256_round_ps_trunc(__m256 x) { + // CONSTRAINED-LABEL: test_mm256_round_ps_trunc + // CONSTRAINED: %{{.*}} = call <8 x float> @llvm.experimental.constrained.trunc.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.ignore") + return _mm256_round_ps(x, 0b1011); +} diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index d92869ff6574f..d2f8740cffbbd 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -246,7 +246,7 @@ 
TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000 __m256d test_mm256_ceil_pd(__m256d x) { // CHECK-LABEL: test_mm256_ceil_pd - // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2) + // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2) return _mm256_ceil_pd(x); } @@ -1526,14 +1526,38 @@ __m256 test_mm256_rcp_ps(__m256 A) { __m256d test_mm256_round_pd(__m256d x) { // CHECK-LABEL: test_mm256_round_pd - // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4) - return _mm256_round_pd(x, 4); + // CHECK: %{{.*}} = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %{{.*}}) + return _mm256_round_pd(x, 0b1000); +} + +__m256d test_mm256_round_pd_mxcsr(__m256d x) { + // CHECK-LABEL: test_mm256_round_pd_mxcsr + // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 12) + return _mm256_round_pd(x, 0b1100); +} + +__m256d test_mm256_round_pd_fround_no_exc(__m256d x) { + // CHECK-LABEL: test_mm256_round_pd_fround_no_exc + // CHECK: %{{.*}} = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 0) + return _mm256_round_pd(x, 0b0000); } __m256 test_mm256_round_ps(__m256 x) { // CHECK-LABEL: test_mm256_round_ps - // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4) - return _mm256_round_ps(x, 4); + // CHECK: %{{.*}} = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %{{.*}}) + return _mm256_round_ps(x, 0b1000); +} + +__m256 test_mm256_round_ps_mxcsr(__m256 x) { + // CHECK-LABEL: test_mm256_round_ps_mxcsr + // CHECK: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 12) + return _mm256_round_ps(x, 0b1100); +} + +__m256 test_mm256_round_ps_fround_no_exc(__m256 x) { + // CHECK-LABEL: test_mm256_round_ps_fround_no_exc + // CHECK: %{{.*}} = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 0) + return 
_mm256_round_ps(x, 0b0000); } __m256 test_mm256_rsqrt_ps(__m256 A) { diff --git a/clang/test/CodeGen/X86/sse41-builtins-constrained.c b/clang/test/CodeGen/X86/sse41-builtins-constrained.c new file mode 100644 index 0000000000000..6b25bd27af7e0 --- /dev/null +++ b/clang/test/CodeGen/X86/sse41-builtins-constrained.c @@ -0,0 +1,96 @@ +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 
-fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK + +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse4.1 -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK +// RUN: %clang_cc1 -x c++ -frounding-math -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin 
-target-feature +sse4.1 -fno-signed-char -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s --check-prefixes=CHECK + + +#include <immintrin.h> + +__m128d test_mm_round_pd_roundeven(__m128d x) { + // CHECK-LABEL: test_mm_round_pd_roundeven + // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.ignore") + return _mm_round_pd(x, 0b1000); +} + +__m128d test_mm_round_pd_mxcsr(__m128d x) { + // CHECK-LABEL: test_mm_round_pd_mxcsr + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 12) + return _mm_round_pd(x, 0b1100); +} + +__m128d test_mm_round_pd_fround_no_exc(__m128d x) { + // CHECK-LABEL: test_mm_round_pd_fround_no_exc + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 0) + return _mm_round_pd(x, 0b0000); +} + +__m128 test_mm_round_ps_floor(__m128 x) { + // CHECK-LABEL: test_mm_round_ps_floor + // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.floor.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.ignore") + return _mm_round_ps(x, 0b1001); +} + +__m128 test_mm_round_ps_mxcsr(__m128 x) { + // CHECK-LABEL: test_mm_round_ps_mxcsr + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 12) + return _mm_round_ps(x, 0b1100); +} + +__m128 test_mm_round_ps_fround_no_exc(__m128 x) { + // CHECK-LABEL: test_mm_round_ps_fround_no_exc + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0) + return _mm_round_ps(x, 0b0000); +} + +__m128d test_mm_round_sd_ceil(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd_ceil + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.ceil.f64(double %[[A:.*]], metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 + return _mm_round_sd(x, y, 
0b1010); +} + +__m128d test_mm_round_sd_mxcsr(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd_mxcsr + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 12) + return _mm_round_sd(x, y, 0b1100); +} + +__m128d test_mm_round_sd_fround_no_exc(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd_fround_no_exc + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0) + return _mm_round_sd(x, y, 0b0000); +} + +__m128 test_mm_round_ss_trunc(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss_trunc + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.trunc.f32(float %[[A:.*]], metadata !"fpexcept.ignore") + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 + return _mm_round_ss(x, y, 0b1011); +} + +__m128 test_mm_round_ss_mxcsr(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss_mxcsr + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %1, i32 12) + return _mm_round_ss(x, y, 0b1100); +} + +__m128 test_mm_round_ss_fround_no_exc(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss_fround_no_exc + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %1, i32 0) + return _mm_round_ss(x, y, 0b0000); +} diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 35fa65a99836b..1be1aa71de737 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -75,13 +75,13 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f} __m128d test_mm_ceil_pd(__m128d x) { // CHECK-LABEL: test_mm_ceil_pd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2) + // CHECK %{{.*}} = call <2 x double> @llvm.ceil.v2f64(<2 x 
double> %{{.*}}) return _mm_ceil_pd(x); } __m128 test_mm_ceil_ps(__m128 x) { // CHECK-LABEL: test_mm_ceil_ps - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2) return _mm_ceil_ps(x); } @@ -430,26 +430,78 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276 __m128d test_mm_round_pd(__m128d x) { // CHECK-LABEL: test_mm_round_pd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4) - return _mm_round_pd(x, 4); + // CHECK: %{{.*}} = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %{{.*}}) + return _mm_round_pd(x, 0b1000); +} + +__m128d test_mm_round_pd_mxcsr(__m128d x) { + // CHECK-LABEL: test_mm_round_pd_mxcsr + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 12) + return _mm_round_pd(x, 0b1100); +} + +__m128d test_mm_round_pd_fround_no_exc(__m128d x) { + // CHECK-LABEL: test_mm_round_pd_fround_no_exc + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 0) + return _mm_round_pd(x, 0b0000); } __m128 test_mm_round_ps(__m128 x) { // CHECK-LABEL: test_mm_round_ps - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4) - return _mm_round_ps(x, 4); + // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}}) + return _mm_round_ps(x, 0b1001); +} + +__m128 test_mm_round_ps_mxcsr(__m128 x) { + // CHECK-LABEL: test_mm_round_ps_mxcsr + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 12) + return _mm_round_ps(x, 0b1100); +} + +__m128 test_mm_round_ps_fround_no_exc(__m128 x) { + // CHECK-LABEL: test_mm_round_ps_fround_no_exc + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0) + return _mm_round_ps(x, 0b0000); } __m128d test_mm_round_sd(__m128d x, __m128d y) { // CHECK-LABEL: 
test_mm_round_sd - // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4) - return _mm_round_sd(x, y, 4); + // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call double @llvm.roundeven.f64(double %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0 + return _mm_round_sd(x, y, 0b1000); +} + +__m128d test_mm_round_sd_mxcsr(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd_mxcsr + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %0, <2 x double> %{{.*}}, i32 12) + return _mm_round_sd(x, y, 0b1100); +} + +__m128d test_mm_round_sd_fround_no_exc(__m128d x, __m128d y) { + // CHECK-LABEL: test_mm_round_sd_fround_no_exc + // CHECK: %{{.*}} = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %0, <2 x double> %{{.*}}, i32 0) + return _mm_round_sd(x, y, 0b0000); } __m128 test_mm_round_ss(__m128 x, __m128 y) { // CHECK-LABEL: test_mm_round_ss - // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4) - return _mm_round_ss(x, y, 4); + // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0 + // CHECK: %[[B:.*]] = call float @llvm.trunc.f32(float %[[A:.*]]) + // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0 + return _mm_round_ss(x, y, 0b1011); +} + +__m128 test_mm_round_ss_mxcsr(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss_mxcsr + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %0, <4 x float> %{{.*}}, i32 12) + return _mm_round_ss(x, y, 0b1100); +} + +__m128 test_mm_round_ss_fround_no_exc(__m128 x, __m128 y) { + // CHECK-LABEL: test_mm_round_ss_fround_no_exc + // CHECK: %{{.*}} = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %0, <4 x float> %{{.*}}, i32 0) + return _mm_round_ss(x, y, 0b0000); } __m128i test_mm_stream_load_si128(__m128i const *a) { diff --git 
a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 0245611bc422b..e36187ea54d6f 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -626,18 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // FP rounding ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">, - DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">, - DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">, - DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>; - def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">, - DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, - llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_sse41_round_ss + : DefaultAttrsIntrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_x86_sse41_round_ps + : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_sse41_round_sd + : DefaultAttrsIntrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<2>>]>; + def int_x86_sse41_round_pd + : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; } // Vector min element @@ -921,12 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">, DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">, - DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; - def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">, - DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], - [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_avx_round_pd_256 + : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; + def int_x86_avx_round_ps_256 + : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], + [IntrNoMem, ImmArg<ArgIndex<1>>]>; } // Horizontal ops _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
