yaxunl updated this revision to Diff 325972.
yaxunl marked 2 inline comments as done.
yaxunl edited the summary of this revision.
yaxunl added a comment.
revised comments and fixed test
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D97318/new/
https://reviews.llvm.org/D97318
Files:
clang/lib/CodeGen/TargetInfo.cpp
clang/test/CodeGenCUDA/float16.cu
clang/test/CodeGenOpenCL/builtins-f16.cl
Index: clang/test/CodeGenOpenCL/builtins-f16.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-f16.cl
+++ clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -6,66 +6,70 @@
void test_half_builtins(half h0, half h1, half h2) {
volatile half res;
- // CHECK: call half @llvm.copysign.f16(half %h0, half %h1)
+ // CHECK: [[h0:%.*]] = bitcast i16 %h0.coerce to half
+ // CHECK: [[h1:%.*]] = bitcast i16 %h1.coerce to half
+ // CHECK: [[h2:%.*]] = bitcast i16 %h2.coerce to half
+
+ // CHECK: call half @llvm.copysign.f16(half [[h0]], half [[h1]])
res = __builtin_copysignf16(h0, h1);
- // CHECK: call half @llvm.fabs.f16(half %h0)
+ // CHECK: call half @llvm.fabs.f16(half [[h0]])
res = __builtin_fabsf16(h0);
- // CHECK: call half @llvm.ceil.f16(half %h0)
+ // CHECK: call half @llvm.ceil.f16(half [[h0]])
res = __builtin_ceilf16(h0);
- // CHECK: call half @llvm.cos.f16(half %h0)
+ // CHECK: call half @llvm.cos.f16(half [[h0]])
res = __builtin_cosf16(h0);
- // CHECK: call half @llvm.exp.f16(half %h0)
+ // CHECK: call half @llvm.exp.f16(half [[h0]])
res = __builtin_expf16(h0);
- // CHECK: call half @llvm.exp2.f16(half %h0)
+ // CHECK: call half @llvm.exp2.f16(half [[h0]])
res = __builtin_exp2f16(h0);
- // CHECK: call half @llvm.floor.f16(half %h0)
+ // CHECK: call half @llvm.floor.f16(half [[h0]])
res = __builtin_floorf16(h0);
- // CHECK: call half @llvm.fma.f16(half %h0, half %h1, half %h2)
+ // CHECK: call half @llvm.fma.f16(half [[h0]], half [[h1]], half [[h2]])
res = __builtin_fmaf16(h0, h1 ,h2);
- // CHECK: call half @llvm.maxnum.f16(half %h0, half %h1)
+ // CHECK: call half @llvm.maxnum.f16(half [[h0]], half [[h1]])
res = __builtin_fmaxf16(h0, h1);
- // CHECK: call half @llvm.minnum.f16(half %h0, half %h1)
+ // CHECK: call half @llvm.minnum.f16(half [[h0]], half [[h1]])
res = __builtin_fminf16(h0, h1);
- // CHECK: frem half %h0, %h1
+ // CHECK: frem half [[h0]], [[h1]]
res = __builtin_fmodf16(h0, h1);
- // CHECK: call half @llvm.pow.f16(half %h0, half %h1)
+ // CHECK: call half @llvm.pow.f16(half [[h0]], half [[h1]])
res = __builtin_powf16(h0, h1);
- // CHECK: call half @llvm.log10.f16(half %h0)
+ // CHECK: call half @llvm.log10.f16(half [[h0]])
res = __builtin_log10f16(h0);
- // CHECK: call half @llvm.log2.f16(half %h0)
+ // CHECK: call half @llvm.log2.f16(half [[h0]])
res = __builtin_log2f16(h0);
- // CHECK: call half @llvm.log.f16(half %h0)
+ // CHECK: call half @llvm.log.f16(half [[h0]])
res = __builtin_logf16(h0);
- // CHECK: call half @llvm.rint.f16(half %h0)
+ // CHECK: call half @llvm.rint.f16(half [[h0]])
res = __builtin_rintf16(h0);
- // CHECK: call half @llvm.round.f16(half %h0)
+ // CHECK: call half @llvm.round.f16(half [[h0]])
res = __builtin_roundf16(h0);
- // CHECK: call half @llvm.sin.f16(half %h0)
+ // CHECK: call half @llvm.sin.f16(half [[h0]])
res = __builtin_sinf16(h0);
- // CHECK: call half @llvm.sqrt.f16(half %h0)
+ // CHECK: call half @llvm.sqrt.f16(half [[h0]])
res = __builtin_sqrtf16(h0);
- // CHECK: call half @llvm.trunc.f16(half %h0)
+ // CHECK: call half @llvm.trunc.f16(half [[h0]])
res = __builtin_truncf16(h0);
- // CHECK: call half @llvm.canonicalize.f16(half %h0)
+ // CHECK: call half @llvm.canonicalize.f16(half [[h0]])
res = __builtin_canonicalizef16(h0);
}
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN: -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN: -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+ short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+ _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+ __fp16 x;
+};
+
+// Check that a struct containing _Float16 is coerced and passed to a kernel
+// in the same way as a struct containing a 16-bit integer.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+ x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
+__global__ void kern2(B x) {
+ x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
+__global__ void kern3(C x) {
+ x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
+void fun1(A x) {
+ kern1<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
+void fun2(B x) {
+ kern2<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun31C(i16 %x.coerce)
+void fun3(C x) {
+ kern3<<<1, 1>>>(x);
+}
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,13 @@
Current = SSE;
} else
llvm_unreachable("unexpected long double representation!");
+ } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+ // AMD64 does not support operations on _Float16 or __fp16 other than
+ // load and store. For load/store operations, _Float16 and __fp16 are
+ // equivalent to a 16-bit integer, since they have the same size and
+ // alignment. We need this to interoperate with GCC, which uses a
+ // 16-bit integer in place of _Float16 or __fp16.
+ Lo = Integer;
}
// FIXME: _Decimal32 and _Decimal64 are SSE.
// FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
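For reference, a minimal sketch of the lowering this classification produces on x86-64. The function name below is illustrative and not part of the patch:

  // C source:
  void take_half(_Float16 x);

  // With Lo = Integer, x is passed as a 16-bit integer in a GPR, so the
  // declaration lowers to:
  //
  //   declare void @take_half(i16)
  //
  // and in a definition the parameter arrives as i16 %x.coerce and is
  // bitcast back to half before use, which is the pattern the updated
  // builtins-f16.cl CHECK lines verify. On the device path (amdgcn) no
  // integer coercion occurs, which is why float16.cu checks i16 for the
  // host stub but half for the amdgpu_kernel definitions of kern2/kern3.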