mike.dvoretsky updated this revision to Diff 149484.
mike.dvoretsky added a comment.
Changed the scalar intrinsic lowering to work via extract-insert.
https://reviews.llvm.org/D45203 contains tests for folding the resulting IR
patterns.
https://reviews.llvm.org/D45202
Files:
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGen/avx-builtins.c
clang/test/CodeGen/avx512f-builtins.c
clang/test/CodeGen/sse41-builtins.c
Index: clang/test/CodeGen/sse41-builtins.c
===================================================================
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -44,25 +44,31 @@
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd
- // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+ // CHECK: @llvm.ceil.v2f64
+ // CHECK-NOT: select
return _mm_ceil_pd(x);
}
__m128 test_mm_ceil_ps(__m128 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+ // CHECK: @llvm.ceil.v4f32
+ // CHECK-NOT: select
return _mm_ceil_ps(x);
}
__m128d test_mm_ceil_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_ceil_sd
- // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+ // CHECK: extractelement
+ // CHECK: @llvm.ceil.f64
+ // CHECK: insertelement
return _mm_ceil_sd(x, y);
}
__m128 test_mm_ceil_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_ceil_ss
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+ // CHECK: extractelement
+ // CHECK: @llvm.ceil.f32
+ // CHECK: insertelement
return _mm_ceil_ss(x, y);
}
@@ -196,25 +202,31 @@
__m128d test_mm_floor_pd(__m128d x) {
// CHECK-LABEL: test_mm_floor_pd
- // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+ // CHECK: @llvm.floor.v2f64
+ // CHECK-NOT: select
return _mm_floor_pd(x);
}
__m128 test_mm_floor_ps(__m128 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+ // CHECK: @llvm.floor.v4f32
+ // CHECK-NOT: select
return _mm_floor_ps(x);
}
__m128d test_mm_floor_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_floor_sd
- // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+ // CHECK: extractelement
+ // CHECK: @llvm.floor.f64
+ // CHECK: insertelement
return _mm_floor_sd(x, y);
}
__m128 test_mm_floor_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_floor_ss
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+ // CHECK: extractelement
+ // CHECK: @llvm.floor.f32
+ // CHECK: insertelement
return _mm_floor_ss(x, y);
}
Index: clang/test/CodeGen/avx512f-builtins.c
===================================================================
--- clang/test/CodeGen/avx512f-builtins.c
+++ clang/test/CodeGen/avx512f-builtins.c
@@ -7565,46 +7565,98 @@
return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
}
+__m512 test_mm512_floor_ps(__m512 __A)
+{
+ // CHECK-LABEL: @test_mm512_floor_ps
+ // CHECK: @llvm.floor.v16f32
+ // CHECK-NOT: select
+ return _mm512_floor_ps(__A);
+}
+
+__m512d test_mm512_floor_pd(__m512d __A)
+{
+ // CHECK-LABEL: @test_mm512_floor_pd
+ // CHECK: @llvm.floor.v8f64
+ // CHECK-NOT: select
+ return _mm512_floor_pd(__A);
+}
+
__m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
- // CHECK-LABEL: @test_mm512_mask_floor_ps
- // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+ // CHECK-LABEL: @test_mm512_mask_floor_ps
+ // CHECK: @llvm.floor.v16f32
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_floor_ps (__W,__U,__A);
}
__m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
- // CHECK-LABEL: @test_mm512_mask_floor_pd
- // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+ // CHECK-LABEL: @test_mm512_mask_floor_pd
+ // CHECK: @llvm.floor.v8f64
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_floor_pd (__W,__U,__A);
}
+__m512 test_mm512_ceil_ps(__m512 __A)
+{
+ // CHECK-LABEL: @test_mm512_ceil_ps
+ // CHECK: @llvm.ceil.v16f32
+ // CHECK-NOT: select
+ return _mm512_ceil_ps(__A);
+}
+
+__m512d test_mm512_ceil_pd(__m512d __A)
+{
+ // CHECK-LABEL: @test_mm512_ceil_pd
+ // CHECK: @llvm.ceil.v8f64
+ // CHECK-NOT: select
+ return _mm512_ceil_pd(__A);
+}
+
__m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
- // CHECK-LABEL: @test_mm512_mask_ceil_ps
- // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+ // CHECK-LABEL: @test_mm512_mask_ceil_ps
+ // CHECK: @llvm.ceil.v16f32
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_ceil_ps (__W,__U,__A);
}
__m512d test_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
- // CHECK-LABEL: @test_mm512_mask_ceil_pd
- // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+ // CHECK-LABEL: @test_mm512_mask_ceil_pd
+ // CHECK: @llvm.ceil.v8f64
+ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
return _mm512_mask_ceil_pd (__W,__U,__A);
}
__m512 test_mm512_mask_roundscale_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
// CHECK-LABEL: @test_mm512_mask_roundscale_ps
// CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+ return _mm512_mask_roundscale_ps(__W,__U,__A, 3);
+}
+
+__m512 test_mm512_mask_roundscale_floor_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+ // CHECK-LABEL: @test_mm512_mask_roundscale_floor_ps
+ // CHECK: @llvm.floor.v16f32
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
return _mm512_mask_roundscale_ps(__W,__U,__A, 1);
}
+__m512 test_mm512_mask_roundscale_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+ // CHECK-LABEL: @test_mm512_mask_roundscale_ceil_ps
+ // CHECK: @llvm.ceil.v16f32
+ // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+ return _mm512_mask_roundscale_ps(__W,__U,__A, 2);
+}
+
__m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A)
{
// CHECK-LABEL: @test_mm512_maskz_roundscale_ps
// CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
- return _mm512_maskz_roundscale_ps(__U,__A, 1);
+ return _mm512_maskz_roundscale_ps(__U,__A, 3);
}
__m512 test_mm512_mask_roundscale_round_ps(__m512 __A,__mmask16 __U,__m512 __C)
@@ -7632,14 +7684,14 @@
{
// CHECK-LABEL: @test_mm512_mask_roundscale_pd
// CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
- return _mm512_mask_roundscale_pd(__W,__U,__A, 1);
+ return _mm512_mask_roundscale_pd(__W,__U,__A, 3);
}
__m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A)
{
// CHECK-LABEL: @test_mm512_maskz_roundscale_pd
// CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
- return _mm512_maskz_roundscale_pd(__U,__A, 1);
+ return _mm512_maskz_roundscale_pd(__U,__A, 3);
}
__m512d test_mm512_mask_roundscale_round_pd(__m512d __A,__mmask8 __U,__m512d __C)
Index: clang/test/CodeGen/avx-builtins.c
===================================================================
--- clang/test/CodeGen/avx-builtins.c
+++ clang/test/CodeGen/avx-builtins.c
@@ -202,13 +202,15 @@
__m256d test_mm256_ceil_pd(__m256d x) {
// CHECK-LABEL: test_mm256_ceil_pd
- // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+ // CHECK: @llvm.ceil.v4f64
+ // CHECK-NOT: select
return _mm256_ceil_pd(x);
}
__m256 test_mm_ceil_ps(__m256 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+ // CHECK: @llvm.ceil.v8f32
+ // CHECK-NOT: select
return _mm256_ceil_ps(x);
}
@@ -364,13 +366,15 @@
__m256d test_mm256_floor_pd(__m256d x) {
// CHECK-LABEL: test_mm256_floor_pd
- // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+ // CHECK: @llvm.floor.v4f64
+ // CHECK-NOT: select
return _mm256_floor_pd(x);
}
__m256 test_mm_floor_ps(__m256 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+ // CHECK: @llvm.floor.v8f32
+ // CHECK-NOT: select
return _mm256_floor_ps(x);
}
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -8408,6 +8408,130 @@
return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
}
+// Lower the X86 SSE4.1/AVX/AVX-512 round and roundscale builtins.  When the
+// rounding-control immediate selects ceil (2) or floor (1) and no exception
+// suppression is requested (SAE == 4, i.e. _MM_FROUND_CUR_DIRECTION), emit
+// the generic llvm.ceil/llvm.floor intrinsics so later IR passes can fold
+// them; otherwise fall back to the matching target-specific intrinsic.
+static Value *EmitX86Round(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
+                           unsigned BuiltinID) {
+  // The rounding-control immediate sits at a different operand index
+  // depending on the builtin's signature.
+  int RoundControl;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_roundss ||
+      BuiltinID == clang::X86::BI__builtin_ia32_roundsd)
+    RoundControl = cast<ConstantInt>(Ops[2])->getSExtValue();
+  else if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+           BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask)
+    RoundControl = cast<ConstantInt>(Ops[4])->getSExtValue();
+  else
+    RoundControl = cast<ConstantInt>(Ops[1])->getSExtValue();
+
+  // Likewise for the SAE immediate; builtins without one behave as if it
+  // were 4 (no suppression).
+  int SAE;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_mask)
+    SAE = cast<ConstantInt>(Ops[4])->getSExtValue();
+  else if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+           BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask)
+    SAE = cast<ConstantInt>(Ops[5])->getSExtValue();
+  else
+    SAE = 4;
+
+  // Anything other than plain ceil/floor keeps the target intrinsic call.
+  if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/)) {
+    Intrinsic::ID ID;
+    switch (BuiltinID) {
+    default: llvm_unreachable("Unsupported intrinsic!");
+    case clang::X86::BI__builtin_ia32_roundps:
+      ID = Intrinsic::x86_sse41_round_ps;
+      break;
+    case clang::X86::BI__builtin_ia32_roundss:
+      ID = Intrinsic::x86_sse41_round_ss;
+      break;
+    case clang::X86::BI__builtin_ia32_roundsd:
+      ID = Intrinsic::x86_sse41_round_sd;
+      break;
+    case clang::X86::BI__builtin_ia32_roundpd:
+      ID = Intrinsic::x86_sse41_round_pd;
+      break;
+    case clang::X86::BI__builtin_ia32_roundpd256:
+      ID = Intrinsic::x86_avx_round_pd_256;
+      break;
+    case clang::X86::BI__builtin_ia32_roundps256:
+      ID = Intrinsic::x86_avx_round_ps_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_512;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_512;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_128_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_128;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_256_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_128_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_128;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_256_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalesd_round_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_sd;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaless_round_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ss;
+      break;
+    }
+    llvm::Function *F = CGF.CGM.getIntrinsic(ID);
+    return CGF.Builder.CreateCall(F, Ops);
+  }
+
+  Value *Src, *Dst, *Mask;
+  bool IsScalar = false;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_roundss ||
+      BuiltinID == clang::X86::BI__builtin_ia32_roundsd ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+    IsScalar = true;
+    if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+      // Masked scalar forms: reduce the mask to its low bit as an i1 and
+      // blend against the passthru operand (Ops[2]).
+      llvm::Type *MaskTy = Ops[3]->getType();
+      llvm::Type *I32Ty = CGF.Builder.getInt32Ty();
+      Value *One = llvm::ConstantInt::get(I32Ty, 1);
+      Value *Zero = llvm::Constant::getNullValue(I32Ty);
+      Mask = (MaskTy == I32Ty) ? Ops[3] : CGF.Builder.CreateZExt(Ops[3], I32Ty);
+      Mask = CGF.Builder.CreateAnd(Mask, One);
+      Mask = CGF.Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+      Dst = Ops[2];
+    } else
+      Dst = Ops[0];
+    // Scalar ops round element 0 of the second source vector.
+    Src = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+  } else {
+    Src = Ops[0];
+    if (BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_128_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_256_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_128_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_256_mask) {
+      Dst = Ops[2];
+      Mask = Ops[3];
+    } else {
+      // Unmasked vector forms: all-ones mask, so no blend is emitted (the
+      // tests CHECK-NOT a select for these).
+      Dst = Src;
+      Mask = llvm::ConstantInt::getAllOnesValue(
+          CGF.Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+    }
+  }
+
+  Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+  Value *F = CGF.CGM.getIntrinsic(ID, Src->getType());
+  Value *Res = CGF.Builder.CreateCall(F, {Src});
+  if (!IsScalar)
+    return EmitX86Select(CGF, Mask, Res, Dst);
+  if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+    // Blend the rounded scalar with element 0 of the passthru, then insert
+    // the result into the first source vector (upper elements come from it).
+    Dst = CGF.Builder.CreateExtractElement(Dst, (uint64_t)0);
+    Res = CGF.Builder.CreateSelect(Mask, Res, Dst);
+    Dst = Ops[0];
+  }
+  return CGF.Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
llvm::Type *Ty = Ops[0]->getType();
@@ -9103,6 +9227,22 @@
return Builder.CreateBitCast(Res, Ops[0]->getType());
}
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundpd256:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_rndscaleps_mask:
+ case X86::BI__builtin_ia32_rndscalepd_mask:
+ case X86::BI__builtin_ia32_rndscalepd_128_mask:
+ case X86::BI__builtin_ia32_rndscalepd_256_mask:
+ case X86::BI__builtin_ia32_rndscaleps_128_mask:
+ case X86::BI__builtin_ia32_rndscaleps_256_mask:
+ case X86::BI__builtin_ia32_rndscalesd_round_mask:
+ case X86::BI__builtin_ia32_rndscaless_round_mask:
+ return EmitX86Round(*this, Ops, BuiltinID);
+
case X86::BI__builtin_ia32_vplzcntd_128:
case X86::BI__builtin_ia32_vplzcntd_256:
case X86::BI__builtin_ia32_vplzcntd_512:
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits