arsenm updated this revision to Diff 558095.
arsenm added a comment.
Drop bitcode auto upgrade handling
https://reviews.llvm.org/D141700
Files:
clang/lib/CodeGen/Targets/AMDGPU.cpp
clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl
clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
llvm/docs/AMDGPUUsage.rst
llvm/lib/IR/AutoUpgrade.cpp
llvm/lib/IR/CMakeLists.txt
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/CMakeLists.txt
llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll
llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll
llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -37,7 +37,7 @@
; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O0-NEXT: Function Alias Analysis Results
-; GCN-O0-NEXT: Lower OpenCL enqueued blocks
+; GCN-O0-NEXT: Externalize enqueued block runtime handles
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand Atomic instructions
@@ -178,7 +178,7 @@
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-NEXT: Function Alias Analysis Results
-; GCN-O1-NEXT: Lower OpenCL enqueued blocks
+; GCN-O1-NEXT: Externalize enqueued block runtime handles
; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O1-NEXT: AMDGPU Attributor
; GCN-O1-NEXT: FunctionPass Manager
@@ -445,7 +445,7 @@
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks
+; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles
; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O1-OPTS-NEXT: AMDGPU Attributor
; GCN-O1-OPTS-NEXT: FunctionPass Manager
@@ -736,7 +736,7 @@
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O2-NEXT: Function Alias Analysis Results
-; GCN-O2-NEXT: Lower OpenCL enqueued blocks
+; GCN-O2-NEXT: Externalize enqueued block runtime handles
; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O2-NEXT: AMDGPU Attributor
; GCN-O2-NEXT: FunctionPass Manager
@@ -1037,7 +1037,7 @@
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
; GCN-O3-NEXT: Function Alias Analysis Results
-; GCN-O3-NEXT: Lower OpenCL enqueued blocks
+; GCN-O3-NEXT: Externalize enqueued block runtime handles
; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O3-NEXT: AMDGPU Attributor
; GCN-O3-NEXT: FunctionPass Manager
Index: llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
+++ llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll
@@ -14,7 +14,8 @@
%struct.B = type { ptr addrspace(1) }
%opencl.clk_event_t = type opaque
-@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1)
+@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1), section ".amdgpu.kernel.runtime.handle"
[email protected] = external addrspace(1) externally_initialized constant ptr addrspace(1)
; CHECK: ---
; CHECK-NEXT: amdhsa.kernels:
@@ -1678,7 +1679,7 @@
; CHECK: .name: __test_block_invoke_kernel
; CHECK: .symbol: __test_block_invoke_kernel.kd
define amdgpu_kernel void @__test_block_invoke_kernel(
- <{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1
+ <{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1 !associated !112
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
!kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
ret void
@@ -1734,6 +1735,29 @@
ret void
}
+; Make sure the device_enqueue_symbol is not reported
+; CHECK: - .args: []
+; CHECK-NEXT: .group_segment_fixed_size: 0
+; CHECK-NEXT: .kernarg_segment_align: 4
+; CHECK-NEXT: .kernarg_segment_size: 0
+; CHECK-NEXT: .language: OpenCL C
+; CHECK-NEXT: .language_version:
+; CHECK-NEXT: - 2
+; CHECK-NEXT: - 0
+; CHECK-NEXT: .max_flat_workgroup_size: 1024
+; CHECK-NEXT: .name: associated_global_not_handle
+; CHECK-NEXT: .private_segment_fixed_size: 0
+; CHECK-NEXT: .sgpr_count:
+; CHECK-NEXT: .sgpr_spill_count: 0
+; CHECK-NEXT: .symbol: associated_global_not_handle.kd
+; CHECK-NEXT: .vgpr_count:
+; CHECK-NEXT: .vgpr_spill_count: 0
+; CHECK-NEXT: .wavefront_size: 64
+; CHECK-NOT: device_enqueue_symbol
+define amdgpu_kernel void @associated_global_not_handle() #3 !associated !113 {
+ ret void
+}
+
; CHECK: amdhsa.printf:
; CHECK-NEXT: - '1:1:4:%d\n'
; CHECK-NEXT: - '2:1:8:%g\n'
@@ -1744,6 +1768,7 @@
attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" }
attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
+attributes #3 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
@@ -1803,5 +1828,7 @@
!101 = !{!"2:1:8:%g\5Cn"}
!110 = !{!"__block_literal"}
!111 = !{!"char", !"char"}
+!112 = !{ptr addrspace(1) @__test_block_invoke_kernel_runtime_handle }
+!113 = !{ptr addrspace(1) @not.a.handle }
; PARSER: AMDGPU HSA Metadata Parser Test: PASS
Index: llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll
+++ /dev/null
@@ -1,214 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s
-
-%struct.ndrange_t = type { i32 }
-%opencl.queue_t = type opaque
-
-define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
- ret void
-}
-
-define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
-entry:
- %block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
- %inst = alloca %struct.ndrange_t, align 4, addrspace(5)
- %block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
- %inst3 = alloca %struct.ndrange_t, align 4, addrspace(5)
- %block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0
- store i32 25, ptr addrspace(5) %block.size, align 8
- %block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1
- store i32 8, ptr addrspace(5) %block.align, align 4
- %block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2
- store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8
- %block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3
- store i8 %b, ptr addrspace(5) %block.captured1, align 8
- %inst4 = addrspacecast ptr addrspace(5) %block to ptr
- %inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
- ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
- %inst10 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
- ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
- %inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
- ptr @0, ptr nonnull %inst4) #2
- %inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
- ptr @1, ptr nonnull %inst4) #2
- %block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0
- store i32 41, ptr addrspace(5) %block.size4, align 8
- %block.align5 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1
- store i32 8, ptr addrspace(5) %block.align5, align 4
- %block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2
- store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8
- %block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5
- store i8 %b, ptr addrspace(5) %block.captured8, align 8
- %block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3
- store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8
- %block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4
- store i64 %d, ptr addrspace(5) %block.captured10, align 8
- %inst8 = addrspacecast ptr addrspace(5) %block2 to ptr
- %inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3,
- ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2
- ret void
-}
-
-; __enqueue_kernel* functions may get inlined
-define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
-entry:
- %inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1))
- store i64 %inst, ptr addrspace(1) %c
- ret void
-}
-
-define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
-entry:
- %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
- %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
- store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
- ret void
-}
-
-declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr
-
-define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 {
-entry:
- %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2
- %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3
- %.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4
- %.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5
- store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1
- store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8
- ret void
-}
-
-@kernel_address_user = global [1 x ptr] [ ptr @block_has_used_kernel_address ]
-
-define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
-entry:
- %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
- %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
- store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
- ret void
-}
-
-define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) {
- store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg
- ret void
-}
-
-define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
- ret void
-}
-
-define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
- ret void
-}
-
-attributes #0 = { "enqueued-block" }
-;.
-; CHECK: @[[KERNEL_ADDRESS_USER:[a-zA-Z0-9_$"\\.-]+]] = global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr)]
-; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
-; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
-; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
-; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
-; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
-;.
-; CHECK-LABEL: define {{[^@]+}}@non_caller
-; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@caller
-; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
-; CHECK-NEXT: [[INST:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
-; CHECK-NEXT: [[BLOCK2:%.*]] = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
-; CHECK-NEXT: [[INST3:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
-; CHECK-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
-; CHECK-NEXT: store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
-; CHECK-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
-; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
-; CHECK-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
-; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
-; CHECK-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
-; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
-; CHECK-NEXT: [[INST4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-; CHECK-NEXT: [[INST5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
-; CHECK-NEXT: [[INST10:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
-; CHECK-NEXT: [[INST11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
-; CHECK-NEXT: [[INST12:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.1.runtime_handle to ptr), ptr nonnull [[INST4]])
-; CHECK-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 0
-; CHECK-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
-; CHECK-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 1
-; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
-; CHECK-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 2
-; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
-; CHECK-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 5
-; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
-; CHECK-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 3
-; CHECK-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
-; CHECK-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 4
-; CHECK-NEXT: store i64 [[D]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
-; CHECK-NEXT: [[INST8:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK2]] to ptr
-; CHECK-NEXT: [[INST9:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST3]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime_handle to ptr), ptr nonnull [[INST8]])
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@inlined_caller
-; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INST:%.*]] = load i64, ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle, align 4
-; CHECK-NEXT: store i64 [[INST]], ptr addrspace(1) [[C]], align 4
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
-; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2
-; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3
-; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
-; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 2
-; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 3
-; CHECK-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 4
-; CHECK-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 5
-; CHECK-NEXT: store i8 [[DOTFCA_6_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
-; CHECK-NEXT: store i64 [[DOTFCA_5_EXTRACT]], ptr addrspace(1) [[DOTFCA_4_EXTRACT]], align 8
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@block_has_used_kernel_address
-; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2
-; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3
-; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@user_of_kernel_address
-; CHECK-SAME: (ptr addrspace(1) [[ARG:%.*]]) {
-; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr), ptr addrspace(1) [[ARG]], align 8
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel
-; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
-; CHECK-NEXT: ret void
-;
-;
-; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel.1
-; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR4:[0-9]+]] {
-; CHECK-NEXT: ret void
-;
-;.
-; CHECK: attributes #[[ATTR0]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_kernel.runtime_handle" }
-; CHECK: attributes #[[ATTR1]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" }
-; CHECK: attributes #[[ATTR2]] = { "enqueued-block" "runtime-handle"="block_has_used_kernel_address.runtime_handle" }
-; CHECK: attributes #[[ATTR3]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" }
-; CHECK: attributes #[[ATTR4]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" }
-;.
Index: llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-export-kernel-runtime-handles < %s | FileCheck %s
+
+%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }
+
+; Associated globals without the correct section should be ignored.
[email protected] = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
[email protected] = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer
+
+;.
+; CHECK: @[[BLOCK_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @[[NOT_A_BLOCK_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
+;.
+define internal amdgpu_kernel void @block_kernel() !associated !0 {
+; CHECK-LABEL: define {{[^@]+}}@block_kernel() !associated !0 {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define internal dso_local amdgpu_kernel void @dso_local_block_kernel() !associated !0 {
+; CHECK-LABEL: define {{[^@]+}}@dso_local_block_kernel() !associated !0 {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define internal amdgpu_kernel void @not_block_kernel() !associated !1 {
+; CHECK-LABEL: define {{[^@]+}}@not_block_kernel() !associated !1 {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define internal amdgpu_kernel void @associated_null() !associated !2 {
+; CHECK-LABEL: define {{[^@]+}}@associated_null() !associated !2 {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define internal amdgpu_kernel void @no_metadata() {
+; CHECK-LABEL: define {{[^@]+}}@no_metadata() {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+!0 = !{ptr addrspace(1) @block.handle }
+!1 = !{ptr addrspace(1) @not.a.block.handle }
+!2 = !{ptr addrspace(1) null }
+
+;.
+; CHECK: [[META0:![0-9]+]] = !{ptr addrspace(1) @block.handle}
+; CHECK: [[META1:![0-9]+]] = !{ptr addrspace(1) @not.a.block.handle}
+; CHECK: [[META2:![0-9]+]] = !{ptr addrspace(1) null}
+;.
Index: llvm/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -54,6 +54,7 @@
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
AMDGPUExportClustering.cpp
+ AMDGPUExportKernelRuntimeHandles.cpp
AMDGPUFrameLowering.cpp
AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp
@@ -78,7 +79,6 @@
AMDGPUMCInstLower.cpp
AMDGPUIGroupLP.cpp
AMDGPUMIRFormatter.cpp
- AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -389,7 +389,7 @@
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
- initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+ initializeAMDGPUExportKernelRuntimeHandlesPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPURegBankCombinerPass(*PR);
@@ -1014,8 +1014,8 @@
if (Arch == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
- // Replace OpenCL enqueued block function pointers with global variables.
- addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
+ // Make enqueued block runtime handles externally visible.
+ addPass(createAMDGPUExportKernelRuntimeHandlesPass());
// Runs before PromoteAlloca so the latter can account for function uses
if (EnableLowerModuleLDS) {
Index: llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// This post-linking pass replaces the function pointer of enqueued
-// block kernel with a global variable (runtime handle) and adds
-// "runtime-handle" attribute to the enqueued block kernel.
-//
-// In LLVM CodeGen the runtime-handle metadata will be translated to
-// RuntimeHandle metadata in code object. Runtime allocates a global buffer
-// for each kernel with RuntimeHandle metadata and saves the kernel address
-// required for the AQL packet into the buffer. __enqueue_kernel function
-// in device library knows that the invoke function pointer in the block
-// literal is actually runtime handle and loads the kernel address from it
-// and put it into AQL packet for dispatching.
-//
-// This cannot be done in FE since FE cannot create a unique global variable
-// with external linkage across LLVM modules. The global variable with internal
-// linkage does not work since optimization passes will try to replace loads
-// of the global variable with its initialization value.
-//
-// It also identifies the kernels directly or indirectly enqueues kernels
-// and adds "calls-enqueue-kernel" function attribute to them, which will
-// be used to determine whether to emit runtime metadata for the kernel
-// enqueue related hidden kernel arguments.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "amdgpu-lower-enqueued-block"
-
-using namespace llvm;
-
-namespace {
-
-/// Lower enqueued blocks.
-class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
-public:
- static char ID;
-
- explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}
-
-private:
- bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace
-
-char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;
-
-char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
- AMDGPUOpenCLEnqueuedBlockLowering::ID;
-
-INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
- "Lower OpenCL enqueued blocks", false, false)
-
-ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
- return new AMDGPUOpenCLEnqueuedBlockLowering();
-}
-
-bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
- DenseSet<Function *> Callers;
- auto &C = M.getContext();
- bool Changed = false;
-
- // ptr kernel_object, i32 private_segment_size, i32 group_segment_size
- StructType *HandleTy = nullptr;
-
- for (auto &F : M.functions()) {
- if (F.hasFnAttribute("enqueued-block")) {
- if (!F.hasName()) {
- SmallString<64> Name;
- Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
- M.getDataLayout());
- F.setName(Name);
- }
- LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');
- auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
- if (!HandleTy) {
- Type *Int32 = Type::getInt32Ty(C);
- HandleTy =
- StructType::create(C, {PointerType::getUnqual(C), Int32, Int32},
- "block.runtime.handle.t");
- }
-
- auto *GV = new GlobalVariable(
- M, HandleTy,
- /*isConstant=*/true, GlobalValue::ExternalLinkage,
- /*Initializer=*/Constant::getNullValue(HandleTy), RuntimeHandle,
- /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
- AMDGPUAS::GLOBAL_ADDRESS,
- /*isExternallyInitialized=*/true);
- LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
-
- F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType()));
- F.addFnAttr("runtime-handle", RuntimeHandle);
- F.setLinkage(GlobalValue::ExternalLinkage);
- Changed = true;
- }
- }
-
- return Changed;
-}
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -21,6 +21,7 @@
namespace llvm {
+class AMDGPUTargetMachine;
class AMDGPUTargetStreamer;
class Argument;
class DataLayout;
@@ -58,7 +59,8 @@
virtual void emitVersion() = 0;
virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) = 0;
- virtual void emitKernelAttrs(const Function &Func,
+ virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM,
+ const Function &Func,
msgpack::MapDocNode Kern) = 0;
};
@@ -95,7 +97,8 @@
void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
- void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) override;
+ void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func,
+ msgpack::MapDocNode Kern) override;
void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);
@@ -141,7 +144,8 @@
void emitVersion() override;
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) override;
- void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) override;
+ void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func,
+ msgpack::MapDocNode Kern) override;
public:
MetadataStreamerMsgPackV5() = default;
Index: llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,8 @@
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
using namespace llvm;
static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -36,6 +38,27 @@
return std::pair(Ty, *ArgAlign);
}
+/// Find the mangled symbol name of the runtime handle for \p EnqueuedBlock.
+static std::string getEnqueuedBlockSymbolName(const AMDGPUTargetMachine &TM,
+ const Function &EnqueuedBlock) {
+ const MDNode *Associated =
+ EnqueuedBlock.getMetadata(LLVMContext::MD_associated);
+ if (!Associated)
+ return "";
+
+ auto *VM = cast<ValueAsMetadata>(Associated->getOperand(0));
+ auto *RuntimeHandle =
+ dyn_cast<GlobalVariable>(VM->getValue()->stripPointerCasts());
+ if (!RuntimeHandle ||
+ RuntimeHandle->getSection() != ".amdgpu.kernel.runtime.handle")
+ return "";
+
+ SmallString<128> Name;
+ TM.getNameWithPrefix(Name, RuntimeHandle,
+ TM.getObjFileLowering()->getMangler());
+ return Name.str().str();
+}
+
namespace llvm {
static cl::opt<bool> DumpHSAMetadata(
@@ -229,7 +252,8 @@
Kern[".language_version"] = LanguageVersion;
}
-void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV4::emitKernelAttrs(const AMDGPUTargetMachine &TM,
+ const Function &Func,
msgpack::MapDocNode Kern) {
if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -243,11 +267,13 @@
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
/*Copy=*/true);
}
- if (Func.hasFnAttribute("runtime-handle")) {
- Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(
- Func.getFnAttribute("runtime-handle").getValueAsString().str(),
- /*Copy=*/true);
+
+ std::string HandleName = getEnqueuedBlockSymbolName(TM, Func);
+ if (!HandleName.empty()) {
+ Kern[".device_enqueue_symbol"] =
+ Kern.getDocument()->getNode(std::move(HandleName), /*Copy=*/true);
}
+
if (Func.hasFnAttribute("device-init"))
Kern[".kind"] = Kern.getDocument()->getNode("init");
else if (Func.hasFnAttribute("device-fini"))
@@ -532,6 +558,8 @@
Func.getCallingConv() != CallingConv::SPIR_KERNEL)
return;
+ const auto &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
+
auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent());
auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);
@@ -543,7 +571,7 @@
Kern[".symbol"] = Kern.getDocument()->getNode(
(Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true);
emitKernelLanguage(Func, Kern);
- emitKernelAttrs(Func, Kern);
+ emitKernelAttrs(TM, Func, Kern);
emitKernelArgs(MF, Kern);
}
@@ -661,15 +689,15 @@
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
}
-void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
+ const Function &Func,
msgpack::MapDocNode Kern) {
- MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);
+ MetadataStreamerMsgPackV4::emitKernelAttrs(TM, Func, Kern);
if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
}
-
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
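For reference, a minimal sketch of the IR shape the streamer now keys on (the names @my_invoke_kernel and @my_invoke_kernel.runtime.handle are hypothetical): .device_enqueue_symbol is emitted only when a kernel's !associated metadata resolves to a global in the ".amdgpu.kernel.runtime.handle" section.

%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }

@my_invoke_kernel.runtime.handle = addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"

define amdgpu_kernel void @my_invoke_kernel() !associated !0 {
  ret void
}

!0 = !{ptr addrspace(1) @my_invoke_kernel.runtime.handle}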
Index: llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp
@@ -0,0 +1,94 @@
+//===- AMDGPUExportKernelRuntimeHandles.cpp - Export runtime handles ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+//
+// Give external linkage to any globals used as OpenCL block enqueue runtime
+// handles so the runtime may access them. For linking purposes these should
+// behave like internal symbols, but they need an external symbol in the final
+// object so the runtime can find them.
+//
+// TODO: This could be replaced with a new linkage type or global object
+// metadata that produces an external symbol in the final object, but allows
+// rename on IR linking. Alternatively if we can rely on
+// GlobalValue::getGlobalIdentifier we can just make these external symbols to
+// begin with.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+#define DEBUG_TYPE "amdgpu-export-kernel-runtime-handles"
+
+using namespace llvm;
+
+namespace {
+
+/// Export enqueued block runtime handles.
+class AMDGPUExportKernelRuntimeHandles : public ModulePass {
+public:
+ static char ID;
+
+ explicit AMDGPUExportKernelRuntimeHandles() : ModulePass(ID) {}
+
+private:
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+char AMDGPUExportKernelRuntimeHandles::ID = 0;
+
+char &llvm::AMDGPUExportKernelRuntimeHandlesID =
+ AMDGPUExportKernelRuntimeHandles::ID;
+
+INITIALIZE_PASS(AMDGPUExportKernelRuntimeHandles, DEBUG_TYPE,
+ "Externalize enqueued block runtime handles", false, false)
+
+ModulePass *llvm::createAMDGPUExportKernelRuntimeHandlesPass() {
+ return new AMDGPUExportKernelRuntimeHandles();
+}
+
+bool AMDGPUExportKernelRuntimeHandles::runOnModule(Module &M) {
+ bool Changed = false;
+
+ const StringLiteral HandleSectionName(".amdgpu.kernel.runtime.handle");
+
+ for (GlobalVariable &GV : M.globals()) {
+ if (GV.getSection() == HandleSectionName) {
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ GV.setDSOLocal(false);
+ Changed = true;
+ }
+ }
+
+ if (!Changed)
+ return false;
+
+ // FIXME: We shouldn't really need to export the kernel address. We can
+ // initialize the runtime handle with the kernel descriptor instead.
+ for (Function &F : M) {
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+ continue;
+
+ const MDNode *Associated = F.getMetadata(LLVMContext::MD_associated);
+ if (!Associated)
+ continue;
+
+ auto *VM = cast<ValueAsMetadata>(Associated->getOperand(0));
+ auto *Handle = dyn_cast<GlobalObject>(VM->getValue());
+ if (Handle && Handle->getSection() == HandleSectionName) {
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ F.setVisibility(GlobalValue::ProtectedVisibility);
+ }
+ }
+
+ return Changed;
+}
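As a before/after sketch of what this pass does (hypothetical names; a minimal module rather than the full test earlier in this diff): the handle global in the special section is given external linkage and marked non-dso-local, and the kernel associated with it becomes an exported symbol with protected visibility.

; input
%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }
@k.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
define internal amdgpu_kernel void @k() !associated !0 { ret void }
!0 = !{ptr addrspace(1) @k.runtime.handle}

; after amdgpu-export-kernel-runtime-handles
@k.runtime.handle = addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
define protected amdgpu_kernel void @k() !associated !0 { ret void }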
Index: llvm/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -360,9 +360,9 @@
void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);
-ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();
-void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
-extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
+ModulePass *createAMDGPUExportKernelRuntimeHandlesPass();
+void initializeAMDGPUExportKernelRuntimeHandlesPass(PassRegistry &);
+extern char &AMDGPUExportKernelRuntimeHandlesID;
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
Index: llvm/lib/IR/CMakeLists.txt
===================================================================
--- llvm/lib/IR/CMakeLists.txt
+++ llvm/lib/IR/CMakeLists.txt
@@ -82,6 +82,7 @@
LINK_COMPONENTS
BinaryFormat
Demangle
Remarks
Support
TargetParser
+ TransformUtils
Index: llvm/lib/IR/AutoUpgrade.cpp
===================================================================
--- llvm/lib/IR/AutoUpgrade.cpp
+++ llvm/lib/IR/AutoUpgrade.cpp
@@ -41,6 +41,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
#include <cstring>
using namespace llvm;
@@ -5076,6 +5078,51 @@
};
} // namespace
+static StructType *getAMDGPURuntimeHandleType(LLVMContext &C,
+ Type *KernelDescriptorPtrTy) {
+ Type *Int32 = Type::getInt32Ty(C);
+ return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
+ "block.runtime.handle.t");
+}
+
+/// Rewrite to the new scheme for enqueued block lowering.
+static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) {
+ if (F.isMaterializable()) {
+ // A verifier error is produced if we add metadata to the function during
+ // linking.
+ return;
+ }
+
+ const StringLiteral EnqueuedBlockName("enqueued-block");
+ if (!F.hasFnAttribute(EnqueuedBlockName))
+ return;
+
+ F.removeFnAttr(EnqueuedBlockName);
+
+ Module *M = F.getParent();
+ LLVMContext &Ctx = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
+
+ StructType *HandleTy = getAMDGPURuntimeHandleType(
+ Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace()));
+
+ std::string RuntimeHandleName = (F.getName() + ".runtime.handle").str();
+
+ auto *RuntimeHandle = new GlobalVariable(
+ *M, HandleTy,
+ /*isConstant=*/true, F.getLinkage(),
+ /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName,
+ /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+ DL.getDefaultGlobalsAddressSpace(),
+ /*isExternallyInitialized=*/true);
+ RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
+
+ MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle));
+ F.setMetadata(LLVMContext::MD_associated, HandleAsMD);
+
+ appendToUsed(*M, {&F, RuntimeHandle});
+}
+
void llvm::UpgradeFunctionAttributes(Function &F) {
// If a function definition doesn't have the strictfp attribute,
// convert any callsite strictfp attributes to nobuiltin.
@@ -5088,6 +5135,9 @@
F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType()));
for (auto &Arg : F.args())
Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));
+
+ if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+ upgradeAMDGPUKernelEnqueuedBlock(F);
}
static bool isOldLoopArgument(Metadata *MD) {
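A before/after sketch of the auto-upgrade itself (hypothetical kernel name; with the amdgcn data layout the handle is created in the default globals address space, addrspace(1)): the old "enqueued-block" attribute is replaced by an externally-initialized handle global in the special section, an !associated link from the kernel to the handle, and llvm.used entries keeping both alive.

; old bitcode
define internal amdgpu_kernel void @my_invoke_kernel() #0 { ret void }
attributes #0 = { "enqueued-block" }

; after UpgradeFunctionAttributes
%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }
@my_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
@llvm.used = appending addrspace(1) global [2 x ptr] [ptr @my_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @my_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata"
define internal amdgpu_kernel void @my_invoke_kernel() !associated !0 { ret void }
!0 = !{ptr addrspace(1) @my_invoke_kernel.runtime.handle}

The handle inherits the function's linkage, which matches the internal handle globals checked in the clang test below.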
Index: llvm/docs/AMDGPUUsage.rst
===================================================================
--- llvm/docs/AMDGPUUsage.rst
+++ llvm/docs/AMDGPUUsage.rst
@@ -1733,6 +1733,9 @@
as position independent code. See :ref:`amdgpu-code-conventions` for
information on conventions used in the isa generation.
+``.amdgpu.kernel.runtime.handle``
+ Symbols used for device enqueue.
+
.. _amdgpu-note-records:
Note Records
Index: clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
===================================================================
--- clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
+++ clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -61,7 +61,13 @@
}
//.
+// CHECK: @__test_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+// CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+// CHECK: @__test_block_invoke_3_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+// CHECK: @__test_block_invoke_4_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.5 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
// CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0
+// CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.7 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+// CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata"
//.
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@callee
@@ -121,7 +127,7 @@
// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
// NOCPU-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
// NOCPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// NOCPU-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// NOCPU-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[TMP4]])
// NOCPU-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
@@ -144,7 +150,7 @@
// NOCPU-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
// NOCPU-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
// NOCPU-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// NOCPU-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// NOCPU-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[TMP12]])
// NOCPU-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
@@ -169,7 +175,7 @@
// NOCPU-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
// NOCPU-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8
-// NOCPU-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
+// NOCPU-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
// NOCPU-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
@@ -189,7 +195,7 @@
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
// NOCPU-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
// NOCPU-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// NOCPU-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
+// NOCPU-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[TMP28]])
// NOCPU-NEXT: ret void
//
//
@@ -212,7 +218,7 @@
//
// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
-// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !associated [[META7:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -246,7 +252,7 @@
//
// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
-// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META12:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -285,7 +291,7 @@
//
// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
-// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 {
+// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META13:![0-9]+]] !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
@@ -312,7 +318,7 @@
//
// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
-// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META18:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -323,7 +329,7 @@
//
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel
-// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !15 !kernel_arg_access_qual !8 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !10 {
+// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !9 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -336,7 +342,7 @@
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
-// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// NOCPU-NEXT: ret void
//
//
@@ -354,7 +360,7 @@
//
// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
-// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {
+// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
// NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -417,10 +423,10 @@
// GFX900-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13:![0-9]+]]
// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7:[0-9]+]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14:![0-9]+]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16:![0-9]+]]
// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18:![0-9]+]]
@@ -437,7 +443,7 @@
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
-// GFX900-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])
+// GFX900-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[TMP4]])
// GFX900-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -460,7 +466,7 @@
// GFX900-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
-// GFX900-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])
+// GFX900-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[TMP12]])
// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
@@ -483,12 +489,12 @@
// GFX900-NEXT: [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8
-// GFX900-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
+// GFX900-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
// GFX900-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
@@ -508,11 +514,11 @@
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
-// GFX900-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
+// GFX900-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[TMP28]])
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
// GFX900-NEXT: ret void
//
//
@@ -533,7 +539,7 @@
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META19:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -565,7 +571,7 @@
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META24:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -602,7 +608,7 @@
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !23 !kernel_arg_access_qual !24 !kernel_arg_type !25 !kernel_arg_base_type !25 !kernel_arg_type_qual !26 {
+// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META25:![0-9]+]] !kernel_arg_addr_space !26 !kernel_arg_access_qual !27 !kernel_arg_type !28 !kernel_arg_base_type !28 !kernel_arg_type_qual !29 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
@@ -621,13 +627,13 @@
// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR9:[0-9]+]]
+// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]]
// GFX900-NEXT: ret void
//
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
-// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META30:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -638,7 +644,7 @@
//
// GFX900: Function Attrs: convergent norecurse nounwind
// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel
-// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space !27 !kernel_arg_access_qual !20 !kernel_arg_type !28 !kernel_arg_base_type !28 !kernel_arg_type_qual !22 {
+// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space !31 !kernel_arg_access_qual !21 !kernel_arg_type !32 !kernel_arg_base_type !32 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
@@ -646,18 +652,18 @@
// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[TBAA7]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
-// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
+// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
-// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]
-// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]
+// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
+// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
// GFX900-NEXT: ret void
//
//
@@ -673,7 +679,7 @@
//
// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
-// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {
+// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
// GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
@@ -682,74 +688,83 @@
// GFX900-NEXT: ret void
//
//.
-// NOCPU: attributes #0 = { "objc_arc_inert" }
-// NOCPU: attributes #1 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// NOCPU: attributes #2 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
-// NOCPU: attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-// NOCPU: attributes #4 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// NOCPU: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// NOCPU: attributes #6 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
-// NOCPU: attributes #7 = { nocallback nofree nosync nounwind willreturn }
-// NOCPU: attributes #8 = { convergent nounwind }
+// NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
+// NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
+// NOCPU: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// NOCPU: attributes #[[ATTR4]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOCPU: attributes #[[ATTR6]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
+// NOCPU: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
+// NOCPU: attributes #[[ATTR8]] = { convergent nounwind }
//.
-// GFX900: attributes #0 = { "objc_arc_inert" }
-// GFX900: attributes #1 = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
-// GFX900: attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
-// GFX900: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-// GFX900: attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-// GFX900: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
-// GFX900: attributes #6 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
-// GFX900: attributes #7 = { nocallback nofree nosync nounwind willreturn }
-// GFX900: attributes #8 = { nounwind }
-// GFX900: attributes #9 = { convergent nounwind }
+// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
+// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
+// GFX900: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// GFX900: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
+// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
+// GFX900: attributes #[[ATTR7]] = { nounwind }
+// GFX900: attributes #[[ATTR8]] = { convergent nounwind }
//.
-// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// NOCPU: !1 = !{i32 1, !"wchar_size", i32 4}
-// NOCPU: !2 = !{i32 2, i32 0}
-// NOCPU: !3 = !{i32 1, i32 0, i32 1, i32 0}
-// NOCPU: !4 = !{!"none", !"none", !"none", !"none"}
-// NOCPU: !5 = !{!"char*", !"char", !"long*", !"long"}
-// NOCPU: !6 = !{!"", !"", !"", !""}
-// NOCPU: !7 = !{i32 0}
-// NOCPU: !8 = !{!"none"}
-// NOCPU: !9 = !{!"__block_literal"}
-// NOCPU: !10 = !{!""}
-// NOCPU: !11 = !{i32 0, i32 3}
-// NOCPU: !12 = !{!"none", !"none"}
-// NOCPU: !13 = !{!"__block_literal", !"void*"}
-// NOCPU: !14 = !{!"", !""}
-// NOCPU: !15 = !{i32 1}
-// NOCPU: !16 = !{!"int*"}
+// NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// NOCPU: [[META2:![0-9]+]] = !{i32 2, i32 0}
+// NOCPU: [[META3:![0-9]+]] = !{i32 1, i32 0, i32 1, i32 0}
+// NOCPU: [[META4:![0-9]+]] = !{!"none", !"none", !"none", !"none"}
+// NOCPU: [[META5:![0-9]+]] = !{!"char*", !"char", !"long*", !"long"}
+// NOCPU: [[META6:![0-9]+]] = !{!"", !"", !"", !""}
+// NOCPU: [[META7]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle}
+// NOCPU: [[META8:![0-9]+]] = !{i32 0}
+// NOCPU: [[META9:![0-9]+]] = !{!"none"}
+// NOCPU: [[META10:![0-9]+]] = !{!"__block_literal"}
+// NOCPU: [[META11:![0-9]+]] = !{!""}
+// NOCPU: [[META12]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle}
+// NOCPU: [[META13]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle}
+// NOCPU: [[META14:![0-9]+]] = !{i32 0, i32 3}
+// NOCPU: [[META15:![0-9]+]] = !{!"none", !"none"}
+// NOCPU: [[META16:![0-9]+]] = !{!"__block_literal", !"void*"}
+// NOCPU: [[META17:![0-9]+]] = !{!"", !""}
+// NOCPU: [[META18]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle}
+// NOCPU: [[META19:![0-9]+]] = !{i32 1}
+// NOCPU: [[META20:![0-9]+]] = !{!"int*"}
+// NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle}
//.
-// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// GFX900: !1 = !{i32 1, !"wchar_size", i32 4}
-// GFX900: !2 = !{i32 2, i32 0}
-// GFX900: !3 = !{!4, !4, i64 0}
-// GFX900: !4 = !{!"long", !5, i64 0}
-// GFX900: !5 = !{!"omnipotent char", !6, i64 0}
-// GFX900: !6 = !{!"Simple C/C++ TBAA"}
-// GFX900: !7 = !{!8, !8, i64 0}
-// GFX900: !8 = !{!"any pointer", !5, i64 0}
-// GFX900: !9 = !{i32 1, i32 0, i32 1, i32 0}
-// GFX900: !10 = !{!"none", !"none", !"none", !"none"}
-// GFX900: !11 = !{!"char*", !"char", !"long*", !"long"}
-// GFX900: !12 = !{!"", !"", !"", !""}
-// GFX900: !13 = !{!5, !5, i64 0}
-// GFX900: !14 = !{!15, !15, i64 0}
-// GFX900: !15 = !{!"int", !5, i64 0}
-// GFX900: !16 = !{!17, !17, i64 0}
-// GFX900: !17 = !{!"queue_t", !5, i64 0}
-// GFX900: !18 = !{i64 0, i64 4, !14}
-// GFX900: !19 = !{i32 0}
-// GFX900: !20 = !{!"none"}
-// GFX900: !21 = !{!"__block_literal"}
-// GFX900: !22 = !{!""}
-// GFX900: !23 = !{i32 0, i32 3}
-// GFX900: !24 = !{!"none", !"none"}
-// GFX900: !25 = !{!"__block_literal", !"void*"}
-// GFX900: !26 = !{!"", !""}
-// GFX900: !27 = !{i32 1}
-// GFX900: !28 = !{!"int*"}
+// GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// GFX900: [[META2:![0-9]+]] = !{i32 2, i32 0}
+// GFX900: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
+// GFX900: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
+// GFX900: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
+// GFX900: [[META6]] = !{!"Simple C/C++ TBAA"}
+// GFX900: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// GFX900: [[META8]] = !{!"any pointer", [[META5]], i64 0}
+// GFX900: [[META9:![0-9]+]] = !{i32 1, i32 0, i32 1, i32 0}
+// GFX900: [[META10:![0-9]+]] = !{!"none", !"none", !"none", !"none"}
+// GFX900: [[META11:![0-9]+]] = !{!"char*", !"char", !"long*", !"long"}
+// GFX900: [[META12:![0-9]+]] = !{!"", !"", !"", !""}
+// GFX900: [[TBAA13]] = !{[[META5]], [[META5]], i64 0}
+// GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
+// GFX900: [[META15]] = !{!"int", [[META5]], i64 0}
+// GFX900: [[TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0}
+// GFX900: [[META17]] = !{!"queue_t", [[META5]], i64 0}
+// GFX900: [[TBAA_STRUCT18]] = !{i64 0, i64 4, [[TBAA14]]}
+// GFX900: [[META19]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle}
+// GFX900: [[META20:![0-9]+]] = !{i32 0}
+// GFX900: [[META21:![0-9]+]] = !{!"none"}
+// GFX900: [[META22:![0-9]+]] = !{!"__block_literal"}
+// GFX900: [[META23:![0-9]+]] = !{!""}
+// GFX900: [[META24]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle}
+// GFX900: [[META25]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle}
+// GFX900: [[META26:![0-9]+]] = !{i32 0, i32 3}
+// GFX900: [[META27:![0-9]+]] = !{!"none", !"none"}
+// GFX900: [[META28:![0-9]+]] = !{!"__block_literal", !"void*"}
+// GFX900: [[META29:![0-9]+]] = !{!"", !""}
+// GFX900: [[META30]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle}
+// GFX900: [[META31:![0-9]+]] = !{i32 1}
+// GFX900: [[META32:![0-9]+]] = !{!"int*"}
+// GFX900: [[META33]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle}
//.
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
// CHECK: {{.*}}
Index: clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl
===================================================================
--- /dev/null
+++ clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl
@@ -0,0 +1,78 @@
+// Make sure that block invoke kernels emitted from static functions with the
+// same name in different modules can be linked together.
+
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_first -DTYPE=float -DCONST=256.0f -emit-llvm-bc -o %t.0.bc %s
+// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_second -DTYPE=int -DCONST=128.0f -emit-llvm-bc -o %t.1.bc %s
+
+// Make sure nothing strange happens with the linkage choices.
+// RUN: opt -passes=globalopt -o %t.opt.0.bc %t.0.bc
+// RUN: opt -passes=globalopt -o %t.opt.1.bc %t.1.bc
+
+// Check the result of linking
+// RUN: llvm-link -S %t.opt.0.bc %t.opt.1.bc -o - | FileCheck %s
+
+// Make sure that a block invoke used with the same name works in multiple
+// translation units.
+
+// CHECK: @llvm.used = appending addrspace(1) global [4 x ptr] [ptr @__static_invoker_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr @__static_invoker_block_invoke_kernel.2, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr)], section "llvm.metadata"
+
+
+// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle.3 = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+
+// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_FIRST_MD:[0-9]+]]
+
+
+// CHECK-LABEL: define internal void @__static_invoker_block_invoke(ptr noundef %.block_descriptor)
+// CHECK: call float @llvm.fmuladd.f32
+
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_first(
+
+
+// CHECK-LABEL: define internal fastcc void @static_invoker(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr)
+// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) byval(%struct.ndrange_t) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr %{{[0-9]+}})
+
+// CHECK: declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr
+
+// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel.2(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_SECOND_MD:[0-9]+]]
+// CHECK: call void @__static_invoker_block_invoke.4(ptr %
+
+
+// CHECK-LABEL: define internal void @__static_invoker_block_invoke.4(ptr noundef %.block_descriptor)
+// CHECK: mul nsw i32
+// CHECK: sitofp
+// CHECK: fadd
+// CHECK: fptosi
+
+// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_second(ptr addrspace(1) noundef align 4 %outptr, ptr addrspace(1) noundef align 4 %argptr, ptr addrspace(1) noundef align 4 %difference)
+
+// CHECK-LABEL: define internal fastcc void @static_invoker.5(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr) unnamed_addr #{{[0-9]+}} {
+// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) byval(%struct.ndrange_t) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr), ptr %{{[0-9]+}})
+
+typedef struct {int a;} ndrange_t;
+
+static void static_invoker(global TYPE* outptr, global TYPE* argptr) {
+ queue_t default_queue;
+ unsigned flags = 0;
+ ndrange_t ndrange;
+
+ enqueue_kernel(default_queue, flags, ndrange,
+ ^(void) {
+ global TYPE* f = argptr;
+ outptr[0] = f[1] * f[2] + CONST;
+ });
+}
+
+kernel void KERNEL_NAME(global TYPE *outptr, global TYPE *argptr, global TYPE *difference) {
+ queue_t default_queue;
+ unsigned flags = 0;
+ ndrange_t ndrange;
+
+ static_invoker(outptr, argptr);
+
+ *difference = CONST;
+}
+
+// CHECK: ![[ASSOC_FIRST_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle}
+// CHECK: ![[ASSOC_SECOND_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3}
Index: clang/lib/CodeGen/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -515,6 +515,41 @@
FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}
+/// Return the IR struct type corresponding to kernel_descriptor_t (see
+/// AMDHSAKernelDescriptor.h).
+static llvm::StructType *getAMDGPUKernelDescriptorType(CodeGenFunction &CGF) {
+ return llvm::StructType::create(
+ CGF.getLLVMContext(),
+ {
+ CGF.Int32Ty, // group_segment_fixed_size
+ CGF.Int32Ty, // private_segment_fixed_size
+ CGF.Int32Ty, // kernarg_size
+ llvm::ArrayType::get(CGF.Int8Ty, 4), // reserved0
+ CGF.Int64Ty, // kernel_code_entry_byte_offset
+ llvm::ArrayType::get(CGF.Int8Ty, 20), // reserved1
+ CGF.Int32Ty, // compute_pgm_rsrc3
+ CGF.Int32Ty, // compute_pgm_rsrc1
+ CGF.Int32Ty, // compute_pgm_rsrc2
+ CGF.Int16Ty, // kernel_code_properties
+ llvm::ArrayType::get(CGF.Int8Ty, 6) // reserved2
+ },
+ "kernel_descriptor_t");
+}
+
+/// Return the IR struct type for the rtinfo struct used by rocm-device-libs
+/// for device enqueue:
+///
+/// ptr addrspace(1) kernel_object, i32 private_segment_size,
+/// i32 group_segment_size
+
+static llvm::StructType *
+getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
+ llvm::Type *KernelDescriptorPtrTy) {
+ llvm::Type *Int32 = llvm::Type::getInt32Ty(C);
+ return llvm::StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
+ "block.runtime.handle.t");
+}
+
/// Create an OpenCL kernel for an enqueued block.
///
/// The type of the first argument (the block literal) is the struct type
@@ -554,23 +589,29 @@
ArgNames.push_back(
llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
}
- std::string Name = Invoke->getName().str() + "_kernel";
+
+ llvm::Module &Mod = CGF.CGM.getModule();
+ const llvm::DataLayout &DL = Mod.getDataLayout();
+
+ llvm::Twine Name = Invoke->getName() + "_kernel";
auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
+
+ // The kernel itself can be internal; the runtime does not directly access
+ // the kernel address (only the kernel descriptor).
auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
- &CGF.CGM.getModule());
+ &Mod);
F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
llvm::AttrBuilder KernelAttrs(C);
// FIXME: The invoke isn't applying the right attributes either
// FIXME: This is missing setTargetAttributes
CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
- KernelAttrs.addAttribute("enqueued-block");
F->addFnAttrs(KernelAttrs);
auto IP = CGF.Builder.saveIP();
auto *BB = llvm::BasicBlock::Create(C, "entry", F);
Builder.SetInsertPoint(BB);
- const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
+ const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
BlockPtr->setAlignment(BlockAlign);
Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
@@ -593,7 +634,40 @@
if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
- return F;
+ llvm::Type *KernelDescriptorTy = getAMDGPUKernelDescriptorType(CGF);
+ llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
+ C, KernelDescriptorTy->getPointerTo(DL.getDefaultGlobalsAddressSpace()));
+ llvm::Constant *RuntimeHandleInitializer =
+ llvm::ConstantAggregateZero::get(HandleTy);
+
+ llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";
+
+ // The runtime needs access to the runtime handle as an external symbol. The
+ // runtime handle will need to be made external later, in the
+ // AMDGPUExportKernelRuntimeHandles pass. The kernel itself has a hidden reference
+ // inside the runtime handle, and is not directly referenced.
+
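+ // For illustration (this mirrors the FileCheck expectations in the tests
+ // above), the emitted handle looks roughly like:
+ //
+ //   @<invoke>_kernel.runtime.handle = internal addrspace(1)
+ //       externally_initialized constant %block.runtime.handle.t
+ //       zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+ //
+ // and the kernel definition carries !associated metadata pointing at the
+ // handle ("<invoke>" here stands for the block invoke function name).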
+ // TODO: We would initialize the first field by declaring F->getName() + ".kd"
+ // to reference the kernel descriptor. The runtime wouldn't need to bother
+ // setting it. We would need to have a final symbol name though.
+ // TODO: Can we directly use an external symbol with getGlobalIdentifier?
+ auto *RuntimeHandle = new llvm::GlobalVariable(
+ Mod, HandleTy,
+ /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+ /*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ DL.getDefaultGlobalsAddressSpace(),
+ /*isExternallyInitialized=*/true);
+
+ llvm::MDNode *HandleAsMD =
+ llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
+ F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);
+
+ RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
+
+ CGF.CGM.addUsedGlobal(F);
+ CGF.CGM.addUsedGlobal(RuntimeHandle);
+ return RuntimeHandle;
}
void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(