yaxunl updated this revision to Diff 118064.
yaxunl marked 5 inline comments as done.
yaxunl added a comment.
Revise by Anastasia's comments.
https://reviews.llvm.org/D38134
Files:
lib/CodeGen/CGBuiltin.cpp
lib/CodeGen/CGOpenCLRuntime.cpp
lib/CodeGen/CGOpenCLRuntime.h
lib/CodeGen/CodeGenTypes.h
lib/CodeGen/TargetInfo.cpp
lib/CodeGen/TargetInfo.h
test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
test/CodeGenOpenCL/cl20-device-side-enqueue.cl
Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl
===================================================================
--- test/CodeGenOpenCL/cl20-device-side-enqueue.cl
+++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -6,10 +6,33 @@
typedef void (^bl_t)(local void *);
typedef struct {int a;} ndrange_t;
-// N.B. The check here only exists to set BL_GLOBAL
-// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* }
+
+// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself.
+// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*)
+
+// For anonymous blocks without captures, emit block literals as global variable.
+// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG9K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BL_GLOBAL_K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G_K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG10:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG10:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+// COMMON: [[BLG11:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG11:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) }
+
+// Emits block literal [[BL_GLOBAL]], invoke function [[INV_G]] and global block variable @block_G
+// COMMON: define internal spir_func void [[INV_G]](i8 addrspace(4)* %{{.*}}, i8 addrspace(3)* %{{.*}})
const bl_t block_G = (bl_t) ^ (local void *a) {};
+// COMMON-LABEL: define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %{{.*}}, i32 addrspace(1)* %b, i32 %i)
kernel void device_side_enqueue(global int *a, global int *b, int i) {
// COMMON: %default_queue = alloca %opencl.queue_t*
queue_t default_queue;
@@ -24,117 +47,139 @@
// COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
clk_event_t event_wait_list2[] = {clk_event};
+ // Emits block literal on stack and invoke function [[INVL1]].
// COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+ // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
// B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()*
// B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()*
// COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
- // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
+ // COMMON-LABEL: call i32 @__enqueue_kernel_basic
+ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]])
enqueue_kernel(default_queue, flags, ndrange,
^(void) {
a[i] = b[i];
});
+ // Emits block literal on stack and invoke function [[INVL2]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)*
// COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
+ // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL2:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke
// COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
// COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)*
- // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
+ // COMMON-LABEL: call i32 @__enqueue_kernel_basic_events
+ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]])
enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
^(void) {
a[i] = b[i];
});
+ // Emits global block literal [[BLG1]] and invoke function [[INVG1]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 256, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 256, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange,
^(local void *p) {
return;
},
256);
char c;
+ // Emits global block literal [[BLG2]] and invoke function [[INVG2]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange,
^(local void *p) {
return;
},
c);
+ // Emits global block literal [[BLG3]] and invoke function [[INVG3]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
// COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** [[AD]] to %opencl.clk_event_t{{.*}}* addrspace(4)*
// COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 256, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 256, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
^(local void *p) {
return;
},
256);
+ // Emits global block literal [[BLG4]] and invoke function [[INVG4]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
// COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** [[AD]] to %opencl.clk_event_t{{.*}}* addrspace(4)*
// COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)*
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
^(local void *p) {
return;
},
c);
long l;
+ // Emits global block literal [[BLG5]] and invoke function [[INVG5]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange,
^(local void *p) {
return;
},
l);
+ // Emits global block literal [[BLG6]] and invoke function [[INVG6]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// B32: %[[TMP:.*]] = alloca [3 x i32]
@@ -144,60 +189,117 @@
// B32: store i32 2, i32* %[[TMP2]], align 4
// B32: %[[TMP3:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 2
// B32: store i32 4, i32* %[[TMP3]], align 4
- // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [3 x i64]
// B64: %[[TMP1:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 1, i64* %[[TMP1]], align 8
// B64: %[[TMP2:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 1
// B64: store i64 2, i64* %[[TMP2]], align 8
// B64: %[[TMP3:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 2
// B64: store i64 4, i64* %[[TMP3]], align 8
- // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange,
^(local void *p1, local void *p2, local void *p3) {
return;
},
1, 2, 4);
+ // Emits global block literal [[BLG7]] and invoke function [[INVG7]].
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
// COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
// B32: %[[TMP:.*]] = alloca [1 x i32]
// B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0
// B32: store i32 0, i32* %[[TMP1]], align 4
- // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
+ // B32-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]])
// B64: %[[TMP:.*]] = alloca [1 x i64]
// B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0
// B64: store i64 4294967296, i64* %[[TMP1]], align 8
- // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
+ // B64-LABEL: call i32 @__enqueue_kernel_vaargs
+ // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]])
enqueue_kernel(default_queue, flags, ndrange,
^(local void *p) {
return;
},
4294967296L);
+ // Emits global block literal [[BLG8]] and invoke function [[INVG8]].
// The full type of these expressions are long (and repeated elsewhere), so we
// capture it as part of the regex for convenience and clarity.
- // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A:@__block_literal_global(\.[0-9]+)?]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A
+ // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A
void (^const block_A)(void) = ^{
return;
};
- // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B
+ // Emits global block literal [[BLG9]] and invoke function [[INVG9]].
+ // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B
void (^const block_B)(local void *) = ^(local void *a) {
return;
};
- // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ // Uses global block literal [[BLG8]] and invoke function [[INVG8]].
+ // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
+ // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
+ // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ block_A();
+
+ // Emits global block literal [[BLG8K]] and invoke function [[INVG8K]]. [[INVG8K]] is the same as [[INV8]] except calling convention, ABI and metadata.
+ // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue
+ // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+ // COMMON-LABEL: call i32 @__enqueue_kernel_basic
+ // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ enqueue_kernel(default_queue, flags, ndrange, block_A);
+
+ // Uses global block literal [[BLG8K]].
+ // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*))
unsigned size = get_kernel_work_group_size(block_A);
- // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+ // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted.
+ // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2)
+ // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)*
+ // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+ block_A();
+
+ // Emits global block literal [[BLG9K]] and invoke function [[INVG9K]]. [[INVG9K]] is the same as [[INV9]] except calling convention, ABI and metadata.
+ // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9K]] to i8 addrspace(1)*) to i8 addrspace(4)*))
size = get_kernel_work_group_size(block_B);
- // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+ // Uses global block literal [[BLG8K]] and invoke function [[INVG8K]]. Make sure no redundant block literal ind invoke functions are emitted.
+ // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*))
size = get_kernel_preferred_work_group_size_multiple(block_A);
- // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+ // Uses global block literal [[BL_GLOBAL_K]] and invoke function [[INV_G_K]]. [[INV_G_K]] is the same as [[INV_G]] except calling convention, ABI and metadata.
+ // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL_K]] to i8 addrspace(1)*) to i8 addrspace(4)*))
size = get_kernel_preferred_work_group_size_multiple(block_G);
- // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*))
+ // Emits global block literal [[BLG10]] and invoke function [[INVG10]].
+ // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG10]] to i8 addrspace(1)*) to i8 addrspace(4)*))
size = get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){});
- // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*))
+
+ // Emits global block literal [[BLG11]] and invoke function [[INVG11]].
+ // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG11]] to i8 addrspace(1)*) to i8 addrspace(4)*))
size = get_kernel_sub_group_count_for_ndrange(ndrange, ^(){});
}
+
+// COMMON: define internal spir_kernel void [[INVL1]](i8 addrspace(4)*) #{{[0-9]+}} {
+// COMMON: entry:
+// COMMON: call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0)
+// COMMON: ret void
+// COMMON: }
+// COMMON: define internal spir_kernel void [[INVL2]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG1]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG2]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG3]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG4]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG5]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG6]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG7]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}})
+// COMMON: define internal spir_kernel void [[INVG8K]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG9K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG10]](i8 addrspace(4)*{{.*}})
+// COMMON: define internal spir_kernel void [[INVG11]](i8 addrspace(4)*{{.*}})
Index: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
===================================================================
--- /dev/null
+++ test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK
+
+typedef struct {int a;} ndrange_t;
+
+// CHECK-LABEL: define amdgpu_kernel void @test
+kernel void test(global char *a, char b, global long *c, long d) {
+ queue_t default_queue;
+ unsigned flags = 0;
+ ndrange_t ndrange;
+
+ enqueue_kernel(default_queue, flags, ndrange,
+ ^(void) {
+ a[0] = b;
+ });
+
+ enqueue_kernel(default_queue, flags, ndrange,
+ ^(void) {
+ a[0] = b;
+ c[0] = d;
+ });
+}
+
+// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)
+// CHECK-SAME: #[[ATTR:[0-9]+]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
+// CHECK: entry:
+// CHECK: %1 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
+// CHECK: store <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %0, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1, align 8
+// CHECK: %2 = addrspacecast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1 to i8 addrspace(4)*
+// CHECK: call void @__test_block_invoke(i8 addrspace(4)* %2)
+// CHECK: ret void
+// CHECK:}
+
+// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)
+// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
+
+// CHECK: attributes #[[ATTR]] = { nounwind "enqueued-block" }
Index: lib/CodeGen/TargetInfo.h
===================================================================
--- lib/CodeGen/TargetInfo.h
+++ lib/CodeGen/TargetInfo.h
@@ -287,6 +287,11 @@
virtual TargetOpenCLBlockHelper *getTargetOpenCLBlockHelper() const {
return nullptr;
}
+ /// Create an OpenCL kernel for an enqueued block.
+ virtual llvm::Function *
+ createEnqueuedBlockKernel(CodeGenFunction &CGF,
+ llvm::Function *BlockInvokeFunc,
+ llvm::Value *BlockLiteral) const;
};
} // namespace CodeGen
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -14,6 +14,7 @@
#include "TargetInfo.h"
#include "ABIInfo.h"
+#include "CGBlocks.h"
#include "CGCXXABI.h"
#include "CGValue.h"
#include "CodeGenFunction.h"
@@ -7617,6 +7618,10 @@
const VarDecl *D) const override;
llvm::SyncScope::ID getLLVMSyncScopeID(SyncScope S,
llvm::LLVMContext &C) const override;
+ llvm::Function *
+ createEnqueuedBlockKernel(CodeGenFunction &CGF,
+ llvm::Function *BlockInvokeFunc,
+ llvm::Value *BlockLiteral) const override;
};
}
@@ -8917,3 +8922,109 @@
return SetCGInfo(new SPIRTargetCodeGenInfo(Types));
}
}
+
+/// Create an OpenCL kernel for an enqueued block.
+///
+/// The kernel has the same function type as the block invoke function. Its
+/// name is the name of the block invoke function postfixed with "_kernel".
+/// It simply calls the block invoke function then returns.
+llvm::Function *
+TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF,
+ llvm::Function *Invoke,
+ llvm::Value *BlockLiteral) const {
+ auto *InvokeFT = Invoke->getFunctionType();
+ llvm::SmallVector<llvm::Type *, 2> ArgTys;
+ for (auto &P : InvokeFT->params())
+ ArgTys.push_back(P);
+ auto &C = CGF.getLLVMContext();
+ std::string Name = Invoke->getName().str() + "_kernel";
+ auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
+ auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
+ &CGF.CGM.getModule());
+ auto IP = CGF.Builder.saveIP();
+ auto *BB = llvm::BasicBlock::Create(C, "entry", F);
+ auto &Builder = CGF.Builder;
+ Builder.SetInsertPoint(BB);
+ llvm::SmallVector<llvm::Value *, 2> Args;
+ for (auto &A : F->args())
+ Args.push_back(&A);
+ Builder.CreateCall(Invoke, Args);
+ Builder.CreateRetVoid();
+ Builder.restoreIP(IP);
+ return F;
+}
+
+/// Create an OpenCL kernel for an enqueued block.
+///
+/// The type of the first argument (the block literal) is the struct type
+/// of the block literal instead of a pointer type. The first argument
+/// (block literal) is passed directly by value to the kernel. The kernel
+/// allocates the same type of struct on stack and stores the block literal
+/// to it and passes its pointer to the block invoke function. The kernel
+/// has "enqueued-block" function attribute and kernel argument metadata.
+llvm::Function *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
+ CodeGenFunction &CGF, llvm::Function *Invoke,
+ llvm::Value *BlockLiteral) const {
+ auto &Builder = CGF.Builder;
+ auto &C = CGF.getLLVMContext();
+
+ auto *BlockTy = BlockLiteral->getType()->getPointerElementType();
+ auto *InvokeFT = Invoke->getFunctionType();
+ llvm::SmallVector<llvm::Type *, 2> ArgTys;
+ llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
+ llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
+ llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
+ llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
+ llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
+ llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
+
+ ArgTys.push_back(BlockTy);
+ ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
+ AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
+ ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
+ ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
+ AccessQuals.push_back(llvm::MDString::get(C, "none"));
+ ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
+ for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
+ ArgTys.push_back(InvokeFT->getParamType(I));
+ ArgTys.push_back(BlockTy);
+ ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
+ AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
+ AccessQuals.push_back(llvm::MDString::get(C, "none"));
+ ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
+ ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
+ ArgNames.push_back(
+ llvm::MDString::get(C, std::string("local_arg") + std::to_string(I)));
+ }
+ std::string Name = Invoke->getName().str() + "_kernel";
+ auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
+ auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
+ &CGF.CGM.getModule());
+ F->addFnAttr("enqueued-block");
+ auto IP = CGF.Builder.saveIP();
+ auto *BB = llvm::BasicBlock::Create(C, "entry", F);
+ Builder.SetInsertPoint(BB);
+ unsigned BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlignment(BlockTy);
+ auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
+ BlockPtr->setAlignment(BlockAlign);
+ Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
+ auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
+ llvm::SmallVector<llvm::Value *, 2> Args;
+ Args.push_back(Cast);
+ for (auto I = F->arg_begin() + 1, E = F->arg_end(); I != E; ++I)
+ Args.push_back(I);
+ Builder.CreateCall(Invoke, Args);
+ Builder.CreateRetVoid();
+ Builder.restoreIP(IP);
+
+ F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
+ F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
+ F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
+ F->setMetadata("kernel_arg_base_type",
+ llvm::MDNode::get(C, ArgBaseTypeNames));
+ F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
+ if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
+ F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
+
+ return F;
+}
Index: lib/CodeGen/CodeGenTypes.h
===================================================================
--- lib/CodeGen/CodeGenTypes.h
+++ lib/CodeGen/CodeGenTypes.h
@@ -164,8 +164,6 @@
llvm::SmallSet<const Type *, 8> RecordsWithOpaqueMemberPointers;
- unsigned ClangCallConvToLLVMCallConv(CallingConv CC);
-
public:
CodeGenTypes(CodeGenModule &cgm);
~CodeGenTypes();
@@ -180,6 +178,9 @@
llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); }
const CodeGenOptions &getCodeGenOpts() const;
+ /// Convert clang calling convention to LLVM callilng convention.
+ unsigned ClangCallConvToLLVMCallConv(CallingConv CC);
+
/// ConvertType - Convert type T into a llvm::Type.
llvm::Type *ConvertType(QualType T);
Index: lib/CodeGen/CGOpenCLRuntime.h
===================================================================
--- lib/CodeGen/CGOpenCLRuntime.h
+++ lib/CodeGen/CGOpenCLRuntime.h
@@ -17,11 +17,13 @@
#define LLVM_CLANG_LIB_CODEGEN_CGOPENCLRUNTIME_H
#include "clang/AST/Type.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
namespace clang {
+class Expr;
class VarDecl;
namespace CodeGen {
@@ -34,6 +36,8 @@
CodeGenModule &CGM;
llvm::Type *PipeTy;
llvm::PointerType *SamplerTy;
+ /// Maps block expression to llvm value.
+ llvm::DenseMap<const Expr *, llvm::Value *> EnqueuedBlockMap;
public:
CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr),
@@ -62,6 +66,9 @@
/// \return __generic void* type.
llvm::PointerType *getGenericVoidPointerType();
+
+ /// \return block literal for enqueued block.
+ llvm::Value *emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E);
};
}
Index: lib/CodeGen/CGOpenCLRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenCLRuntime.cpp
+++ lib/CodeGen/CGOpenCLRuntime.cpp
@@ -16,6 +16,7 @@
#include "CGOpenCLRuntime.h"
#include "CodeGenFunction.h"
#include "TargetInfo.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalValue.h"
#include <assert.h>
@@ -110,3 +111,95 @@
CGM.getLLVMContext(),
CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic));
}
+
+llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF,
+ const Expr *E) {
+ // The block literal may be assigned to a const variable. Chasing down
+ // to get the block literal.
+ if (auto DR = dyn_cast<DeclRefExpr>(E)) {
+ E = cast<VarDecl>(DR->getDecl())->getInit();
+ }
+ if (auto Cast = dyn_cast<CastExpr>(E)) {
+ E = Cast->getSubExpr();
+ }
+ auto *Block = cast<BlockExpr>(E);
+
+ // The same block literal may be enqueued multiple times. Cache it if
+ // possible.
+ bool Cacheable = !Block->getBlockDecl()->hasCaptures();
+ if (Cacheable) {
+ auto Loc = EnqueuedBlockMap.find(Block);
+ if (Loc != EnqueuedBlockMap.end()) {
+ return Loc->second;
+ }
+ }
+
+ // Emit block literal as a common block expression. There are two situations.
+ // If there is no capture, the block literal is emitted as a global variable,
+ // otherwise it is emitted as an alloca. In both case, it is emitted as a
+ // structure and the block invoke function is the third field.
+ auto *V = CGF.EmitScalarExpr(Block);
+ llvm::Function *F;
+ if (auto *I = dyn_cast<llvm::Instruction>(V)) {
+ // If the block literal is emitted as an instruction, it is an alloca
+ // and the block invoke function is stored to GEP of this alloca.
+ // Find the GEP and store instruction and replace the invoke function
+ // with a kernel.
+ auto *Block = I->stripPointerCasts();
+ llvm::Function *Invoke = nullptr;
+ llvm::StoreInst *ST = nullptr;
+ llvm::GetElementPtrInst *GEP = nullptr;
+ for (; I; I = I->getPrevNode()) {
+ if ((ST = dyn_cast<llvm::StoreInst>(I))) {
+ if ((GEP =
+ dyn_cast<llvm::GetElementPtrInst>(ST->getPointerOperand()))) {
+ if (GEP->getPointerOperand() == Block &&
+ cast<llvm::ConstantInt>(GEP->getOperand(2))->getZExtValue() ==
+ 2) {
+ Invoke = cast<llvm::Function>(
+ ST->getValueOperand()->stripPointerCasts());
+ break;
+ }
+ }
+ }
+ }
+ assert(ST && GEP && Invoke && "Invalid block invoke function");
+ F = CGF.getTargetHooks().createEnqueuedBlockKernel(CGF, Invoke, Block);
+ ST->replaceUsesOfWith(
+ ST->getValueOperand(),
+ llvm::ConstantExpr::getPointerCast(
+ F, ST->getPointerOperand()->getType()->getPointerElementType()));
+ } else {
+ // If the block literal is emitted as a global variable, the block invoke
+ // function is in its initializer. Create a new global variable and replace
+ // the invoke function with a kernel.
+ auto *Block = cast<llvm::GlobalVariable>(
+ cast<llvm::Constant>(V)->stripPointerCasts());
+ auto *BlockInit = cast<llvm::ConstantStruct>(Block->getInitializer());
+ auto *Invoke = cast<llvm::Function>(
+ BlockInit->getAggregateElement(2)->stripPointerCasts());
+ F = CGF.getTargetHooks().createEnqueuedBlockKernel(CGF, Invoke, Block);
+ ConstantInitBuilder Builder(CGM);
+ auto Fields = Builder.beginStruct(BlockInit->getType());
+ Fields.add(BlockInit->getAggregateElement(0U));
+ Fields.add(BlockInit->getAggregateElement(1));
+ Fields.add(llvm::ConstantExpr::getPointerCast(
+ F, BlockInit->getAggregateElement(2)->getType()));
+ Block = Fields.finishAndCreateGlobal(
+ "__opencl_enqueued_block_literal_global",
+ CharUnits::fromQuantity(
+ Block->getPointerAlignment(CGM.getDataLayout())),
+ /*Constant=*/true, llvm::GlobalValue::InternalLinkage,
+ Block->getType()->getPointerAddressSpace());
+ V = llvm::ConstantExpr::getPointerCast(Block, V->getType());
+ }
+
+ // The common part of the post-processing of the kernel goes here.
+ F->addFnAttr(llvm::Attribute::NoUnwind);
+ F->setCallingConv(
+ CGF.getTypes().ClangCallConvToLLVMCallConv(CallingConv::CC_OpenCLKernel));
+ if (Cacheable) {
+ EnqueuedBlockMap[Block] = V;
+ }
+ return V;
+}
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -2609,7 +2609,8 @@
Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys, 4), false);
llvm::Value *Block = Builder.CreatePointerCast(
- EmitScalarExpr(E->getArg(3)), GenericVoidPtrTy);
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)),
+ GenericVoidPtrTy);
AttrBuilder B;
B.addAttribute(Attribute::ByVal);
@@ -2650,8 +2651,9 @@
if (E->getArg(3)->getType()->isBlockPointerType()) {
// No events passed, but has variadic arguments.
Name = "__enqueue_kernel_vaargs";
- auto *Block = Builder.CreatePointerCast(EmitScalarExpr(E->getArg(3)),
- GenericVoidPtrTy);
+ auto *Block = Builder.CreatePointerCast(
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)),
+ GenericVoidPtrTy);
auto *PtrToSizeArray = CreateArrayForSizeVar(4);
// Create a vector of the arguments, as well as a constant value to
@@ -2689,7 +2691,8 @@
EventList = Builder.CreatePointerCast(EventList, EventPtrTy);
ClkEvent = Builder.CreatePointerCast(ClkEvent, EventPtrTy);
llvm::Value *Block = Builder.CreatePointerCast(
- EmitScalarExpr(E->getArg(6)), GenericVoidPtrTy);
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6)),
+ GenericVoidPtrTy);
std::vector<llvm::Type *> ArgTys = {
QueueTy, Int32Ty, RangeTy, Int32Ty,
@@ -2730,7 +2733,8 @@
case Builtin::BIget_kernel_work_group_size: {
llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
getContext().getTargetAddressSpace(LangAS::opencl_generic));
- Value *Arg = EmitScalarExpr(E->getArg(0));
+ Value *Arg =
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy);
return RValue::get(Builder.CreateCall(
CGM.CreateRuntimeFunction(
@@ -2741,7 +2745,8 @@
case Builtin::BIget_kernel_preferred_work_group_size_multiple: {
llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy(
getContext().getTargetAddressSpace(LangAS::opencl_generic));
- Value *Arg = EmitScalarExpr(E->getArg(0));
+ Value *Arg =
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0));
Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy);
return RValue::get(Builder.CreateCall(
CGM.CreateRuntimeFunction(
@@ -2755,7 +2760,8 @@
getContext().getTargetAddressSpace(LangAS::opencl_generic));
LValue NDRangeL = EmitAggExprToLValue(E->getArg(0));
llvm::Value *NDRange = NDRangeL.getAddress().getPointer();
- Value *Block = EmitScalarExpr(E->getArg(1));
+ Value *Block =
+ CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1));
Block = Builder.CreatePointerCast(Block, GenericVoidPtrTy);
const char *Name =
BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits