arsenm created this revision. arsenm added reviewers: Anastasia, yaxunl. Herald added subscribers: kosarev, jdoerfert, kerbowa, jvesely. Herald added a project: All. arsenm requested review of this revision. Herald added a subscriber: wdng.
The generated invoke kernel was missing important environment context, such as denormal-fp-math and target-features. Curiously, with this change the kernel seems to lose nounwind. Note that this only fixes the actual invoke kernel. The invoke function already sets the default attribute set for internal functions; however, that is still buggy because it does not apply any function attributes from the use context (it is also missing uniform-work-group-size). I also noticed that the update_cc_test_checks output for attributes does not seem to connect the captured attribute variables to the attribute definitions at the end (although I think the numbers happen to work out correctly). https://reviews.llvm.org/D141620 Files: clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
Index: clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl =================================================================== --- clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -411,7 +411,7 @@ // COMMON: ret void // COMMON: } // COMMON: define spir_kernel void [[INVLK2]](i8 addrspace(4)*{{.*}}) -// COMMON: define spir_kernel void [[INVGK1]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) [[INVOKE_ATTR:#[0-9]+]] +// COMMON: define spir_kernel void [[INVGK1]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) [[INVOKE_KERNEL_ATTR:#[0-9]+]] // COMMON: define spir_kernel void [[INVGK2]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) // COMMON: define spir_kernel void [[INVGK3]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) // COMMON: define spir_kernel void [[INVGK4]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) @@ -422,7 +422,7 @@ // COMMON: ret void // COMMON: } // COMMON: define spir_kernel void [[INVGK7]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) -// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}}) [[INVG8_INVOKE_FUNC_ATTR:#[0-9]+]] // COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}}) // COMMON: define spir_kernel void [[INVGK8]](i8 addrspace(4)*{{.*}}) // COMMON: define spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) @@ -431,4 +431,5 @@ // COMMON: define spir_kernel void [[INVGK10]](i8 addrspace(4)*{{.*}}) // COMMON: define spir_kernel void [[INVGK11]](i8 addrspace(4)*{{.*}}) -// COMMON: attributes [[INVOKE_ATTR]] = { convergent nounwind } +// COMMON: attributes [[INVG8_INVOKE_FUNC_ATTR]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// COMMON: attributes [[INVOKE_KERNEL_ATTR]] = { convergent norecurse "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } Index: clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -0,0 +1,102 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs +// RUN: %clang_cc1 -fno-ident -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" -fdenormal-fp-math-f32=preserve-sign -cl-uniform-work-group-size | FileCheck --check-prefix=SPIR32 %s + +// Test that attributes are correctly applied to functions introduced for +// enqueued blocks + + +typedef void (^bl_t)(local void *); +typedef struct {int a;} ndrange_t; + +kernel void device_side_enqueue(global int *a, global int *b, int i) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[i] = b[i]; + }); +} +// SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone +// SPIR32-LABEL: define {{[^@]+}}@device_side_enqueue +// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 { +// SPIR32-NEXT: entry: +// SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr, align 4 +// SPIR32-NEXT: [[FLAGS:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4 +// SPIR32-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4 +// 
SPIR32-NEXT: [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: store i32 0, ptr [[FLAGS]], align 4 +// SPIR32-NEXT: [[TMP0:%.*]] = load ptr, ptr [[DEFAULT_QUEUE]], align 4 +// SPIR32-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4 +// SPIR32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) +// SPIR32-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0 +// SPIR32-NEXT: store i32 24, ptr [[BLOCK_SIZE]], align 4 +// SPIR32-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1 +// SPIR32-NEXT: store i32 4, ptr [[BLOCK_ALIGN]], align 4 +// SPIR32-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 2 +// SPIR32-NEXT: store ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke to ptr addrspace(4)), ptr [[BLOCK_INVOKE]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 3 +// SPIR32-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP2]], ptr [[BLOCK_CAPTURED]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 4 +// SPIR32-NEXT: [[TMP3:%.*]] = load i32, ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: store i32 [[TMP3]], ptr 
[[BLOCK_CAPTURED1]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURED2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 5 +// SPIR32-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4 +// SPIR32-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4) +// SPIR32-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(ptr [[TMP0]], i32 [[TMP1]], ptr byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) +// SPIR32-NEXT: ret void +// +// +// SPIR32: Function Attrs: convergent noinline nounwind optnone +// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke +// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SPIR32-NEXT: entry: +// SPIR32-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 +// SPIR32-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 +// SPIR32-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 +// SPIR32-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[BLOCK_ADDR]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 +// SPIR32-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 +// SPIR32-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4 +// SPIR32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds 
i32, ptr addrspace(1) [[TMP0]], i32 [[TMP1]] +// SPIR32-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 +// SPIR32-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4 +// SPIR32-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 +// SPIR32-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR3]], align 4 +// SPIR32-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[TMP3]], i32 [[TMP4]] +// SPIR32-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 4 +// SPIR32-NEXT: ret void +// +// +// SPIR32: Function Attrs: convergent norecurse +// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel +// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +// SPIR32-NEXT: entry: +// SPIR32-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) +// SPIR32-NEXT: ret void +// +//. +// SPIR32: attributes #0 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// SPIR32: attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// SPIR32: attributes #2 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #3 = { convergent norecurse "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. 
+// SPIR32: !0 = !{i32 1, !"wchar_size", i32 4} +// SPIR32: !1 = !{i32 2, i32 0} +// SPIR32: !2 = !{i32 1, i32 1, i32 0} +// SPIR32: !3 = !{!"none", !"none", !"none"} +// SPIR32: !4 = !{!"int*", !"int*", !"int"} +// SPIR32: !5 = !{!"", !"", !""} +//. Index: clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl =================================================================== --- clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs --prefix-filecheck-ir-name VAR -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa -target-cpu gfx900 %s | FileCheck %s --check-prefix=CHECK +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefix=CHECK typedef struct {int a;} ndrange_t; @@ -35,9 +35,40 @@ enqueue_kernel(default_queue, flags, ndrange, block); } + +// Test that target attributes are applied to the functions inserted for the +// block. +__attribute__((target("s-memtime-inst"))) +kernel void test_target_features_kernel(global int *i) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + __builtin_amdgcn_s_memtime(); + }); +} + +__attribute__((target("s-memtime-inst"))) +void test_target_features_func(global int *i) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + __builtin_amdgcn_s_memtime(); + }); +} + +//. 
+// CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0 +// CHECK: @__block_literal_global.1 = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_func_block_invoke }, align 8 #0 +//. // CHECK: Function Attrs: convergent noinline norecurse nounwind optnone // CHECK-LABEL: define {{[^@]+}}@callee -// CHECK-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -53,7 +84,7 @@ // // CHECK: Function Attrs: convergent noinline norecurse nounwind optnone // CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +// CHECK-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { // CHECK-NEXT: entry: // CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // CHECK-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) @@ -167,7 +198,7 @@ // // CHECK: Function Attrs: convergent noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke -// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] { // 
CHECK-NEXT: entry: // CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -182,9 +213,9 @@ // CHECK-NEXT: ret void // // -// CHECK: Function Attrs: convergent nounwind +// CHECK: Function Attrs: convergent norecurse // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { +// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // CHECK-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -195,7 +226,7 @@ // // CHECK: Function Attrs: convergent noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3]] { +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -216,9 +247,9 @@ // CHECK-NEXT: ret void // // -// CHECK: Function Attrs: convergent nounwind +// CHECK: Function Attrs: convergent norecurse // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR4]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { +// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr 
addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // CHECK-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -229,7 +260,7 @@ // // CHECK: Function Attrs: convergent noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR3]] { +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR4]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) @@ -255,9 +286,9 @@ // CHECK-NEXT: ret void // // -// CHECK: Function Attrs: convergent nounwind +// CHECK: Function Attrs: convergent norecurse // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR4]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 { +// CHECK-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // CHECK-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) 
[[TMP2]], align 8 @@ -268,7 +299,7 @@ // // CHECK: Function Attrs: convergent noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR3]] { +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // CHECK-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -278,13 +309,13 @@ // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // CHECK-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8 -// CHECK-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR11:[0-9]+]] // CHECK-NEXT: ret void // // -// CHECK: Function Attrs: convergent nounwind +// CHECK: Function Attrs: convergent norecurse // CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// CHECK-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR4]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { +// CHECK-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // CHECK-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 @@ -292,13 +323,101 @@ // CHECK-NEXT: call void @__test_block_invoke_4(ptr [[TMP2]]) // CHECK-NEXT: ret void // +// 
+// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@test_target_features_kernel +// CHECK-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !15 !kernel_arg_access_qual !8 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !10 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr 
addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent norecurse +// CHECK-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel +// CHECK-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) +// CHECK-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr +// CHECK-NEXT: call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline norecurse nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@test_target_features_func +// CHECK-SAME: (ptr addrspace(1) noundef [[I:%.*]]) #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr 
addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_func_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global.1 to ptr)) +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@__test_target_features_func_block_invoke +// CHECK-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: convergent norecurse +// CHECK-LABEL: define {{[^@]+}}@__test_target_features_func_block_invoke_kernel +// CHECK-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR10:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) +// CHECK-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr +// CHECK-NEXT: call void @__test_target_features_func_block_invoke(ptr [[TMP2]]) +// CHECK-NEXT: ret void +// //. 
-// CHECK: attributes #0 = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #1 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" "uniform-work-group-size"="false" } -// CHECK: attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// CHECK: attributes #3 = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" } -// CHECK: attributes #4 = { convergent nounwind "enqueued-block" } -// CHECK: attributes #5 = { convergent } +// CHECK: attributes #0 = { "objc_arc_inert" } +// CHECK: attributes #1 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #2 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +// CHECK: attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// CHECK: attributes #4 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #5 = { convergent norecurse "denormal-fp-math-f32"="preserve-sign,preserve-sign" 
"enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } +// CHECK: attributes #6 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// CHECK: attributes #7 = { nocallback nofree nosync nounwind willreturn } +// CHECK: attributes #8 = { convergent norecurse "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// CHECK: attributes #9 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" } +// CHECK: attributes #10 = { convergent norecurse "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" } +// CHECK: attributes #11 = { convergent } //. // CHECK: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} // CHECK: !1 = !{i32 1, !"wchar_size", i32 4} @@ -315,4 +434,6 @@ // CHECK: !12 = !{!"none", !"none"} // CHECK: !13 = !{!"__block_literal", !"void*"} // CHECK: !14 = !{!"", !""} +// CHECK: !15 = !{i32 1} +// CHECK: !16 = !{!"int*"} //. Index: clang/lib/CodeGen/TargetInfo.cpp =================================================================== --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -12416,6 +12416,13 @@ CGF.getTypes().ClangCallConvToLLVMCallConv(CallingConv::CC_OpenCLKernel); F->setCallingConv(KernelCC); + // Inherit attributes from the use context. + // + // TODO: Are there any non-kernel specific attributes we need to take care to + // avoid? 
+ llvm::AttrBuilder KernelAttrs(C, CGF.CurFn->getAttributes().getFnAttrs()); + F->addFnAttrs(KernelAttrs); + auto IP = CGF.Builder.saveIP(); auto *BB = llvm::BasicBlock::Create(C, "entry", F); auto &Builder = CGF.Builder; @@ -12424,10 +12431,6 @@ llvm::CallInst *Call = Builder.CreateCall(Invoke, Args); Call->setCallingConv(Invoke->getCallingConv()); - // FIXME: Apply default attributes - F->addFnAttr(llvm::Attribute::NoUnwind); - F->addFnAttr(llvm::Attribute::Convergent); - Builder.CreateRetVoid(); Builder.restoreIP(IP); return F; @@ -12478,10 +12481,13 @@ &CGF.CGM.getModule()); F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - // FIXME: Apply default attributes - F->addFnAttr(llvm::Attribute::NoUnwind); - F->addFnAttr(llvm::Attribute::Convergent); - F->addFnAttr("enqueued-block"); + // Inherit attributes from the use context. + // TODO: Are there any non-kernel specific attributes we need to take care to + // avoid? + // FIXME: The invoke isn't applying the right attributes either + llvm::AttrBuilder KernelAttrs(C, CGF.CurFn->getAttributes().getFnAttrs()); + KernelAttrs.addAttribute("enqueued-block"); + F->addFnAttrs(KernelAttrs); auto IP = CGF.Builder.saveIP(); auto *BB = llvm::BasicBlock::Create(C, "entry", F);
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits