sfantao created this revision.
sfantao added reviewers: ABataev, hfinkel, rjmccall.
sfantao added subscribers: cfe-commits, Hahnfeld.
This patch rebases http://reviews.llvm.org/D11361 and creates a new revision for it, as requested by John. Here is the adapted original summary (the global-captures issue has been fixed in the meantime):

This patch implements the outlining of offloading functions for code annotated with the OpenMP target directive. It uses a temporary naming scheme for the outlined functions that will have to be updated later, once target-side codegen and registration of offloading libraries are implemented: the names need to be made unique in the produced library. Unlike other captured regions, target offloading cannot use the captured declaration directly, because each captured field has to be passed explicitly to the runtime library and associated with a potentially different mapping type (to/from/alloc...). A proxy function is used to wrap the default capturing implemented in Clang and adapt it to what OpenMP offloading requires.

Thanks!
Samuel

http://reviews.llvm.org/D12871

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmtOpenMP.cpp
  test/OpenMP/target_codegen.cpp
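For context, here is a minimal source-level example (illustrative only, not taken from the patch or the test below) of the kind of region this codegen handles: every variable referenced inside the region becomes a captured field that is passed to the offloading runtime with its own base pointer, size, and map type.

#include <cstdio>

int main() {
  int a = 0;
  double c[5][10] = {};

#pragma omp target
  {
    a += 1;         // scalar capture, 4 bytes, default to/from mapping
    c[1][2] += 1.0; // aggregate capture, mapped with its full size (400 bytes)
  }

  std::printf("%d %f\n", a, c[1][2]);
  return 0;
}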
Index: test/OpenMP/target_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/target_codegen.cpp @@ -0,0 +1,753 @@ +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-DAG: [[TT:%.+]] = type { i64, i8 } +// CHECK-DAG: [[S1:%.+]] = type { double } + +// We have 8 target regions, but only 7 that actually will generate offloading +// code, only 6 will have mapped arguments, and only 4 have all-constant map +// sizes. + +// CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2] +// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 3] +// CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2] +// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 3, i32 3] +// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 3, i32 3, i32 1, i32 3, i32 3, i32 1, i32 1, i32 3, i32 3] +// CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40] +// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 3, i32 3, i32 3] +// CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40] +// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 3, i32 3, i32 3, i32 3] +// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 3, i32 3, i32 1, i32 1, i32 3] +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 +// CHECK-DAG: @{{.*}} = private constant i8 0 + +template<typename tx, typename ty> +struct TT{ + tx X; + ty Y; +}; + +// CHECK: define {{.*}}[[FOO:@.+]]( +int foo(int n) { + int a = 0; + short aa = 0; + float b[10]; + float bn[n]; + double c[5][10]; + double cn[5][n]; + TT<long long, char> d; + + // CHECK: br label %[[TRY:[^,]+]] + // CHECK: [[TRY]] + // CHECK: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i[[SZ]]* null, i32* null) + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target + { + } + + // CHECK: call void [[HVT1:@.+]](i32* {{[^,]+}}) + #pragma omp target if(0) + { + a += 1; + } + + // CHECK: br label %[[TRY:[^,]+]] + // CHECK: [[TRY]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 1, i8** [[BP:%[^,]+]], i8** [[P:%[^,]+]], i[[SZ]]* 
getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET2]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT2]], i32 0, i32 0)) + // CHECK-DAG: [[BP]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[P]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BPR]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR]], i32 0, i32 [[IDX0]] + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = bitcast i16* %{{.+}} to i8* + // CHECK-DAG: [[P0]] = bitcast i16* %{{.+}} to i8* + + // CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT2:@.+]](i16* {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target if(1) + { + aa += 1; + } + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 10 + // CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] + // CHECK: [[TRY]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 2, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT3]], i32 0, i32 0)) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 0 + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0 + // CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] + // CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] + // CHECK-DAG: [[BP0]] = bitcast i32* %{{.+}} to i8* + // CHECK-DAG: [[P0]] = bitcast i32* %{{.+}} to i8* + + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1 + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1 + // CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] + // CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] + // CHECK-DAG: [[BP1]] = bitcast i16* %{{.+}} to i8* + // CHECK-DAG: [[P1]] = bitcast i16* %{{.+}} to i8* + + // CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT3:@.+]]({{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target if(n>10) + { + a += 1; + aa += 1; + } + + // We capture 3 VLA sizes in this target region + // CHECK: store i[[SZ]] [[BNELEMSIZE:%.+]], i[[SZ]]* [[VLA0:%[^,]+]] + // CHECK: [[BNSIZE:%.+]] = mul nuw i[[SZ]] 4, [[BNELEMSIZE:%.+]] + + // CHECK: store i[[SZ]] 5, i[[SZ]]* [[VLA1:%[^,]+]] + // CHECK: store i[[SZ]] [[CNELEMSIZE1:%.+]], i[[SZ]]* [[VLA2:%[^,]+]] + // CHECK: [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[CNELEMSIZE1]] + // CHECK: [[CNSIZE:%.+]] = mul nuw i[[SZ]] 8, [[CNELEMSIZE2]] + + // CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20 + // CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] + // CHECK: [[TRY]] + // CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, 
i8* @{{[^,]+}}, i32 9, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT4]], i32 0, i32 0)) + // CHECK-DAG: [[BPR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[PR]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P:%[^,]+]], i32 0, i32 0 + // CHECK-DAG: [[SR]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S:%[^,]+]], i32 0, i32 0 + + // CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX0:[0-9]+]] + // CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX0]] + // CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX1:[0-9]+]] + // CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX1]] + // CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX2:[0-9]+]] + // CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX2]] + // CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX3:[0-9]+]] + // CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX3]] + // CHECK-DAG: [[SADDR4:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX4:[0-9]+]] + // CHECK-DAG: [[BPADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[PADDR4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX4]] + // CHECK-DAG: [[SADDR5:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX5:[0-9]+]] + // CHECK-DAG: [[BPADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[PADDR5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX5]] + // CHECK-DAG: [[SADDR6:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX6:[0-9]+]] + // CHECK-DAG: [[BPADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[PADDR6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX6]] + // CHECK-DAG: [[SADDR7:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX7:[0-9]+]] + // CHECK-DAG: [[BPADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[PADDR7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX7]] + // CHECK-DAG: [[SADDR8:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[S]], i32 0, i32 [[IDX8:[0-9]+]] + // CHECK-DAG: [[BPADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BP]], i32 0, i32 [[IDX8]] + // CHECK-DAG: [[PADDR8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[P]], i32 0, i32 [[IDX8]] + + // The names below are not necessarily consistent with the names used for the + // addresses 
above as some are repeated. + // CHECK-DAG: [[BP0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8* + // CHECK-DAG: [[P0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8* + // CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8* + // CHECK-DAG: [[P1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8* + // CHECK-DAG: store i8* [[BP1]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P1]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP2:%[^,]+]] = bitcast i[[SZ]]* [[VLA2]] to i8* + // CHECK-DAG: [[P2:%[^,]+]] = bitcast i[[SZ]]* [[VLA2]] to i8* + // CHECK-DAG: store i8* [[BP2]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P2]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP3:%[^,]+]] = bitcast i32* %{{.+}} to i8* + // CHECK-DAG: [[P3:%[^,]+]] = bitcast i32* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: [[P4:%[^,]+]] = bitcast [10 x float]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 40, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: [[P5:%[^,]+]] = bitcast float* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP5]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P5]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[BNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: [[P6:%[^,]+]] = bitcast [5 x [10 x double]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP6]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P6]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] 400, i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: [[P7:%[^,]+]] = bitcast double* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP7]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P7]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] [[CNSIZE]], i[[SZ]]* {{%[^,]+}} + + // CHECK-DAG: [[BP8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: [[P8:%[^,]+]] = bitcast [[TT]]* %{{.+}} to i8* + // CHECK-DAG: store i8* [[BP8]], i8** {{%[^,]+}} + // CHECK-DAG: store i8* [[P8]], i8** {{%[^,]+}} + // CHECK-DAG: store i[[SZ]] {{12|16}}, i[[SZ]]* {{%[^,]+}} + + // CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT4:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target if(n>20) + { + a += 1; + b[2] += 1.0; + bn[3] += 1.0; + c[1][2] += 1.0; + cn[1][3] += 1.0; + d.X += 1; + d.Y += 1; + } + + return a; +} + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions in foo(). + +// CHECK: define internal void [[HVT0]] + +// CHECK: define internal void [[HVT1]] +// Create context and reference. 
+// CHECK-DAG: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. +// CHECK-DAG: [[LOCAL_A:%.+]] = alloca i32* +// CHECK-DAG: store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]] +// Store captures in the context. +// CHECK-DAG: [[FIELD_A:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 0 +// CHECK-DAG: [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]], +// CHECK-DAG: store i32* [[REF_A]], i32** [[FIELD_A]], +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[FIELD2_A:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 0 +// CHECK-DAG: [[REF2_A:%.+]] = load i32*, i32** [[FIELD2_A]] +// Use captures. +// CHECK-DAG: load i32, i32* [[REF2_A]] + +// CHECK: define internal void [[HVT2]] +// CHECK-DAG: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. +// CHECK-DAG: [[LOCAL_AA:%.+]] = alloca i16* +// CHECK-DAG: store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]] +// Store captures in the context. +// CHECK-DAG: [[FIELD_AA:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 0 +// CHECK-DAG: [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]], +// CHECK-DAG: store i16* [[REF_AA]], i16** [[FIELD_AA]], +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[FIELD2_AA:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 0 +// CHECK-DAG: [[REF2_AA:%.+]] = load i16*, i16** [[FIELD2_AA]] +// Use captures. +// CHECK-DAG: load i16, i16* [[REF2_AA]] + +// CHECK: define internal void [[HVT3]] +// Create context and reference. +// CHECK-DAG: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. +// CHECK-DAG: [[LOCAL_A:%.+]] = alloca i32* +// CHECK-DAG: [[LOCAL_AA:%.+]] = alloca i16* +// CHECK-DAG: store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]] +// CHECK-DAG: store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]] +// Store captures in the context. +// CHECK-DAG: [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]], +// CHECK-DAG: [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]], +// CHECK-DAG: store i32* [[REF_A]], i32** [[FIELD_A:%.+]], +// CHECK-DAG: store i16* [[REF_AA]], i16** [[FIELD_AA:%.+]], +// CHECK-DAG: [[FIELD_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[REF2_A:%.+]] = load i32*, i32** [[FIELD2_A:%.+]], +// CHECK-DAG: [[REF2_AA:%.+]] = load i16*, i16** [[FIELD2_AA:%.+]], +// CHECK-DAG: [[FIELD2_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// Use captures. +// CHECK-DAG: load i32, i32* [[REF2_A]] +// CHECK-DAG: load i16, i16* [[REF2_AA]] + +// CHECK: define internal void [[HVT4]] +// Create context and reference. 
+// CHECK: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK: [[LOCAL_A:%.+]] = alloca i32* +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. +// CHECK-DAG: [[LOCAL_B:%.+]] = alloca [10 x float]* +// CHECK-DAG: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]* +// CHECK-DAG: [[LOCAL_BN:%.+]] = alloca float* +// CHECK-DAG: [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]* +// CHECK-DAG: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]* +// CHECK-DAG: [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]* +// CHECK-DAG: [[LOCAL_CN:%.+]] = alloca double* +// CHECK-DAG: [[LOCAL_D:%.+]] = alloca [[TT]]* +// CHECK-DAG: store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]] +// CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]]* [[ARG_VLA1:%.+]], i[[SZ]]** [[LOCAL_VLA1]] +// CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]] +// CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]] +// CHECK-DAG: store i[[SZ]]* [[ARG_VLA2:%.+]], i[[SZ]]** [[LOCAL_VLA2]] +// CHECK-DAG: store i[[SZ]]* [[ARG_VLA3:%.+]], i[[SZ]]** [[LOCAL_VLA3]] +// CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]] +// CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]] +// Store captures in the context. +// CHECK-DAG: [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]], +// CHECK-DAG: [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]], +// CHECK-DAG: [[REF_VLA1:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA1]], +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA1]], +// CHECK-DAG: [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]], +// CHECK-DAG: [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]], +// CHECK-DAG: [[REF_VLA2:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA2]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA2]], +// CHECK-DAG: [[REF_VLA3:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA3]], +// CHECK-DAG: [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA3]], +// CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], +// CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], +// CHECK-DAG: store i32* [[REF_A]], i32** [[FIELD_A:%.+]], +// CHECK-DAG: store [10 x float]* [[REF_B]], [10 x float]** [[FIELD_B:%.+]], +// CHECK-DAG: store i[[SZ]] [[VAL_VLA1]], i[[SZ]]* [[FIELD_VLA1:%.+]], +// CHECK-DAG: store float* [[REF_BN]], float** [[FIELD_BN:%.+]], +// CHECK-DAG: store [5 x [10 x double]]* [[REF_C]], [5 x [10 x double]]** [[FIELD_C:%.+]], +// CHECK-DAG: store i[[SZ]] [[VAL_VLA2]], i[[SZ]]* [[FIELD_VLA2:%.+]], +// CHECK-DAG: store i[[SZ]] [[VAL_VLA3]], i[[SZ]]* [[FIELD_VLA3:%.+]], +// CHECK-DAG: store double* [[REF_CN]], double** [[FIELD_CN:%.+]], +// CHECK-DAG: store [[TT]]* [[REF_D]], [[TT]]** [[FIELD_D:%.+]], +// CHECK-DAG: [[FIELD_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_VLA1]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_BN]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_C]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_VLA2]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_VLA3]] = getelementptr inbounds 
[[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_CN]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_D]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[FIELD2_A:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 0 +// CHECK-DAG: [[FIELD2_B:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 1 +// CHECK-DAG: [[FIELD2_VLA1:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 2 +// CHECK-DAG: [[FIELD2_BN:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 3 +// CHECK-DAG: [[FIELD2_C:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 4 +// CHECK-DAG: [[FIELD2_VLA2:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 5 +// CHECK-DAG: [[FIELD2_VLA3:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 6 +// CHECK-DAG: [[FIELD2_CN:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 7 +// CHECK-DAG: [[FIELD2_D:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 8 +// CHECK-DAG: [[REF2_A:%.+]] = load i32*, i32** [[FIELD2_A]], +// CHECK-DAG: [[REF2_B:%.+]] = load [10 x float]*, [10 x float]** [[FIELD2_B]], +// CHECK-DAG: [[REF2_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[FIELD2_VLA1]], +// CHECK-DAG: [[REF2_BN:%.+]] = load float*, float** [[FIELD2_BN]], +// CHECK-DAG: [[REF2_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[FIELD2_C]], +// CHECK-DAG: [[REF2_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[FIELD2_VLA2]], +// CHECK-DAG: [[REF2_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[FIELD2_VLA3]], +// CHECK-DAG: [[REF2_CN:%.+]] = load double*, double** [[FIELD2_CN]], +// CHECK-DAG: [[REF2_D:%.+]] = load [[TT]]*, [[TT]]** [[FIELD2_D]], +// Use captures. 
+// CHECK-DAG: load i32, i32* [[REF2_A]] +// CHECK-DAG: getelementptr inbounds [10 x float], [10 x float]* [[REF2_B]], i[[SZ]] 0, i[[SZ]] 2 +// CHECK-DAG: getelementptr inbounds float, float* [[REF2_BN]], i[[SZ]] 3 +// CHECK-DAG: getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[REF2_C]], i[[SZ]] 0, i[[SZ]] 1 +// CHECK-DAG: getelementptr inbounds double, double* [[REF2_CN]], i[[SZ]] %{{.+}} +// CHECK-DAG: getelementptr inbounds [[TT]], [[TT]]* [[REF2_D]], i32 0, i32 0 + +template<typename tx> +tx ftemplate(int n) { + tx a = 0; + short aa = 0; + tx b[10]; + + #pragma omp target if(n>40) + { + a += 1; + aa += 1; + b[2] += 1; + } + + return a; +} + +static +int fstatic(int n) { + int a = 0; + short aa = 0; + char aaa = 0; + int b[10]; + + #pragma omp target if(n>50) + { + a += 1; + aa += 1; + aaa += 1; + b[2] += 1; + } + + return a; +} + +struct S1 { + double a; + + int r1(int n){ + int b = n+1; + short int c[2][n]; + + #pragma omp target if(n>60) + { + this->a = (double)b + 1.5; + c[1][1] = ++a; + } + + return c[1][1] + (int)b; + } +}; + +// CHECK: define {{.*}}@{{.*}}bar{{.*}} +int bar(int n){ + int a = 0; + + // CHECK: call {{.*}}i32 [[FOO]](i32 {{.*}}) + a += foo(n); + + S1 S; + // CHECK: call {{.*}}i32 [[FS1:@.+]]([[S1]]* {{.*}}, i32 {{.*}}) + a += S.r1(n); + + // CHECK: call {{.*}}i32 [[FSTATIC:@.+]](i32 {{.*}}) + a += fstatic(n); + + // CHECK: call {{.*}}i32 [[FTEMPLATE:@.+]](i32 {{.*}}) + a += ftemplate<int>(n); + + return a; +} + +// +// CHECK: define {{.*}}[[FS1]] +// +// We capture 2 VLA sizes in this target region +// CHECK: store i[[SZ]] 2, i[[SZ]]* [[VLA0:%[^,]+]] +// CHECK: store i[[SZ]] [[CELEMSIZE1:%.+]], i[[SZ]]* [[VLA1:%[^,]+]] +// CHECK: [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[CELEMSIZE1]] +// CHECK: [[CSIZE:%.+]] = mul nuw i[[SZ]] 2, [[CELEMSIZE2]] + +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60 +// CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] +// CHECK: [[TRY]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 5, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* [[SR:%[^,]+]], i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT7]], i32 0, i32 0)) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SR]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S:%.+]], i32 0, i32 0 +// CHECK-DAG: [[SADDR0:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX0:[0-9]+]] +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX0]] +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX0]] +// CHECK-DAG: [[SADDR1:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX1:[0-9]+]] +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX1]] +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX1]] +// CHECK-DAG: [[SADDR2:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX2:[0-9]+]] +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX2]] +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX2]] +// CHECK-DAG: [[SADDR3:%.+]] = getelementptr inbounds [5 x i[[SZ]]], [5 x i[[SZ]]]* [[S]], i32 [[IDX3:[0-9]+]] +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr 
inbounds [5 x i8*], [5 x i8*]* [[BP]], i32 [[IDX3]] +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[P]], i32 [[IDX3]] + +// The names below are not necessarily consistent with the names used for the +// addresses above as some are repeated. +// CHECK-DAG: [[BP0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8* +// CHECK-DAG: [[P0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8* +// CHECK-DAG: store i8* [[BP0]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P0]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8* +// CHECK-DAG: [[P1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8* +// CHECK-DAG: store i8* [[BP1]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P1]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP2:%[^,]+]] = bitcast i32* %{{.+}} to i8* +// CHECK-DAG: [[P2:%[^,]+]] = bitcast i32* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP2]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P2]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: [[P3:%[^,]+]] = bitcast [[S1]]* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP3]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P3]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] 8, i[[SZ]]* {{%[^,]+}} + +// CHECK-DAG: [[BP4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: [[P4:%[^,]+]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: store i8* [[BP4]], i8** {{%[^,]+}} +// CHECK-DAG: store i8* [[P4]], i8** {{%[^,]+}} +// CHECK-DAG: store i[[SZ]] [[CSIZE]], i[[SZ]]* {{%[^,]+}} + +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT7:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FSTATIC]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 50 +// CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] +// CHECK: [[TRY]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 4, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([4 x i[[SZ]]], [4 x i[[SZ]]]* [[SIZET6]], i32 0, i32 0), i32* getelementptr inbounds ([4 x i32], [4 x i32]* [[MAPT6]], i32 0, i32 0)) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = bitcast i32* %{{.+}} to i8* +// CHECK-DAG: [[P0]] = bitcast i32* %{{.+}} to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: [[P1]] = bitcast i16* %{{.+}} to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, 
i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] + +// CHECK-DAG: [[BPADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 3 +// CHECK-DAG: [[PADDR3:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 3 +// CHECK-DAG: store i8* [[BP3:%[^,]+]], i8** [[BPADDR3]] +// CHECK-DAG: store i8* [[P3:%[^,]+]], i8** [[PADDR3]] +// CHECK-DAG: [[BP3]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P3]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT6:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// +// CHECK: define {{.*}}[[FTEMPLATE]] +// +// CHECK: [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 40 +// CHECK: br i1 [[IF]], label %[[TRY:[^,]+]], label %[[FAIL:[^,]+]] +// CHECK: [[TRY]] +// CHECK-DAG: [[RET:%.+]] = call i32 @__tgt_target(i32 -1, i8* @{{[^,]+}}, i32 3, i8** [[BPR:%[^,]+]], i8** [[PR:%[^,]+]], i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0)) +// CHECK-DAG: [[BPR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP:%.+]], i32 0, i32 0 +// CHECK-DAG: [[PR]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P:%.+]], i32 0, i32 0 + +// CHECK-DAG: [[BPADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 0 +// CHECK-DAG: [[PADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 0 +// CHECK-DAG: store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]] +// CHECK-DAG: store i8* [[P0:%[^,]+]], i8** [[PADDR0]] +// CHECK-DAG: [[BP0]] = bitcast i32* %{{.+}} to i8* +// CHECK-DAG: [[P0]] = bitcast i32* %{{.+}} to i8* + +// CHECK-DAG: [[BPADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 1 +// CHECK-DAG: [[PADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 1 +// CHECK-DAG: store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]] +// CHECK-DAG: store i8* [[P1:%[^,]+]], i8** [[PADDR1]] +// CHECK-DAG: [[BP1]] = bitcast i16* %{{.+}} to i8* +// CHECK-DAG: [[P1]] = bitcast i16* %{{.+}} to i8* + +// CHECK-DAG: [[BPADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 2 +// CHECK-DAG: [[PADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 2 +// CHECK-DAG: store i8* [[BP2:%[^,]+]], i8** [[BPADDR2]] +// CHECK-DAG: store i8* [[P2:%[^,]+]], i8** [[PADDR2]] +// CHECK-DAG: [[BP2]] = bitcast [10 x i32]* %{{.+}} to i8* +// CHECK-DAG: [[P2]] = bitcast [10 x i32]* %{{.+}} to i8* + +// CHECK: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 +// CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL]], label %[[END:[^,]+]] +// CHECK: [[FAIL]] +// CHECK: call void [[HVT5:@.+]]({{[^,]+}}, {{[^,]+}}, {{[^,]+}}) +// CHECK-NEXT: br label %[[END]] +// CHECK: [[END]] + +// Check that the offloading functions are emitted and that the arguments are +// correct and loaded correctly for the target regions of the callees of bar(). + +// CHECK: define internal void [[HVT7]] +// Create context and reference. +// CHECK: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, + +// Create local storage for each capture. 
+// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1]]* +// CHECK: [[LOCAL_B:%.+]] = alloca i32* +// CHECK: [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]* +// CHECK: [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]* +// CHECK: [[LOCAL_C:%.+]] = alloca i16* +// CHECK: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]] +// CHECK-DAG: store i32* [[ARG_B:%.+]], i32** [[LOCAL_B]] +// CHECK-DAG: store i[[SZ]]* [[ARG_VLA1:%.+]], i[[SZ]]** [[LOCAL_VLA1]] +// CHECK-DAG: store i[[SZ]]* [[ARG_VLA2:%.+]], i[[SZ]]** [[LOCAL_VLA2]] +// CHECK-DAG: store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]] +// Store captures in the context. +// CHECK-DAG: [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]], +// CHECK-DAG: [[REF_B:%.+]] = load i32*, i32** [[LOCAL_B]], +// CHECK-DAG: [[REF_VLA1:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA1]], +// CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA1]], +// CHECK-DAG: [[REF_VLA2:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA2]], +// CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA2]], +// CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], +// CHECK-DAG: store [[S1]]* [[REF_THIS]], [[S1]]** [[FIELD_THIS:%.+]], +// CHECK-DAG: store i32* [[REF_B]], i32** [[FIELD_B:%.+]], +// CHECK-DAG: store i[[SZ]] [[VAL_VLA1]], i[[SZ]]* [[FIELD_VLA1:%.+]], +// CHECK-DAG: store i[[SZ]] [[VAL_VLA2]], i[[SZ]]* [[FIELD_VLA2:%.+]], +// CHECK-DAG: store i16* [[REF_C]], i16** [[FIELD_C:%.+]], +// CHECK-DAG: [[FIELD_THIS]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_VLA1]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_VLA2]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_C]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[FIELD2_THIS:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 0 +// CHECK-DAG: [[FIELD2_B:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 1 +// CHECK-DAG: [[FIELD2_VLA1:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 2 +// CHECK-DAG: [[FIELD2_VLA2:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 3 +// CHECK-DAG: [[FIELD2_C:%.+]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 4 +// CHECK-DAG: [[REF2_THIS:%.+]] = load [[S1]]*, [[S1]]** [[FIELD2_THIS]], +// CHECK-DAG: [[REF2_B:%.+]] = load i32*, i32** [[FIELD2_B]], +// CHECK-DAG: [[REF2_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[FIELD2_VLA1]], +// CHECK-DAG: [[REF2_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[FIELD2_VLA2]], +// CHECK-DAG: [[REF2_C:%.+]] = load i16*, i16** [[FIELD2_C]], +// Use captures. +// CHECK-DAG: getelementptr inbounds [[S1]], [[S1]]* [[REF2_THIS]], i32 0, i32 0 +// CHECK-DAG: load i32, i32* [[REF2_B]] +// CHECK-DAG: getelementptr inbounds i16, i16* [[REF2_C]], i[[SZ]] %{{.+}} + + +// CHECK: define internal void [[HVT6]] +// Create context and reference. +// CHECK-DAG: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. 
+// CHECK-DAG: [[LOCAL_A:%.+]] = alloca i32* +// CHECK-DAG: [[LOCAL_AA:%.+]] = alloca i16* +// CHECK-DAG: [[LOCAL_AAA:%.+]] = alloca i8* +// CHECK-DAG: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK-DAG: store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]] +// CHECK-DAG: store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]] +// CHECK-DAG: store i8* [[ARG_AAA:%.+]], i8** [[LOCAL_AAA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. +// CHECK-DAG: [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]], +// CHECK-DAG: [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]], +// CHECK-DAG: [[REF_AAA:%.+]] = load i8*, i8** [[LOCAL_AAA]], +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], +// CHECK-DAG: store i32* [[REF_A]], i32** [[FIELD_A:%.+]], +// CHECK-DAG: store i16* [[REF_AA]], i16** [[FIELD_AA:%.+]], +// CHECK-DAG: store i8* [[REF_AAA]], i8** [[FIELD_AAA:%.+]], +// CHECK-DAG: store [10 x i32]* [[REF_B]], [10 x i32]** [[FIELD_B:%.+]], +// CHECK-DAG: [[FIELD_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_AAA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[REF2_A:%.+]] = load i32*, i32** [[FIELD2_A:%.+]], +// CHECK-DAG: [[REF2_AA:%.+]] = load i16*, i16** [[FIELD2_AA:%.+]], +// CHECK-DAG: [[REF2_AAA:%.+]] = load i8*, i8** [[FIELD2_AAA:%.+]], +// CHECK-DAG: [[REF2_B:%.+]] = load [10 x i32]*, [10 x i32]** [[FIELD2_B:%.+]], +// CHECK-DAG: [[FIELD2_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_AAA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// Use captures. +// CHECK-DAG: load i32, i32* [[REF2_A]] +// CHECK-DAG: load i16, i16* [[REF2_AA]] +// CHECK-DAG: load i8, i8* [[REF2_AAA]] +// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF2_B]], i[[SZ]] 0, i[[SZ]] 2 + +// CHECK: define internal void [[HVT5]] +// Create context and reference. +// CHECK-DAG: [[CAPREF:%.+]] = alloca [[CAPTY:%[^,].+]]*, +// CHECK-DAG: [[CAP:%.+]] = alloca [[CAPTY]], +// CHECK-DAG: store [[CAPTY]]* [[CAP]], [[CAPTY]]** [[CAPREF]] +// Create local storage for each capture. +// CHECK-DAG: [[LOCAL_A:%.+]] = alloca i32* +// CHECK-DAG: [[LOCAL_AA:%.+]] = alloca i16* +// CHECK-DAG: [[LOCAL_B:%.+]] = alloca [10 x i32]* +// CHECK-DAG: store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]] +// CHECK-DAG: store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]] +// CHECK-DAG: store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]] +// Store captures in the context. 
+// CHECK-DAG: [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]], +// CHECK-DAG: [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]], +// CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], +// CHECK-DAG: store i32* [[REF_A]], i32** [[FIELD_A:%.+]], +// CHECK-DAG: store i16* [[REF_AA]], i16** [[FIELD_AA:%.+]], +// CHECK-DAG: store [10 x i32]* [[REF_B]], [10 x i32]** [[FIELD_B:%.+]], +// CHECK-DAG: [[FIELD_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP]], i32 0, i32 {{[0-9]}} +// Get capture from context. +// CHECK: [[CAP2:%.+]] = load [[CAPTY]]*, [[CAPTY]]** [[CAPREF]] +// CHECK-DAG: [[REF2_A:%.+]] = load i32*, i32** [[FIELD2_A:%.+]], +// CHECK-DAG: [[REF2_AA:%.+]] = load i16*, i16** [[FIELD2_AA:%.+]], +// CHECK-DAG: [[REF2_B:%.+]] = load [10 x i32]*, [10 x i32]** [[FIELD2_B:%.+]], +// CHECK-DAG: [[FIELD2_A]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_AA]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// CHECK-DAG: [[FIELD2_B]] = getelementptr inbounds [[CAPTY]], [[CAPTY]]* [[CAP2]], i32 0, i32 {{[0-9]}} +// Use captures. +// CHECK-DAG: load i32, i32* [[REF2_A]] +// CHECK-DAG: load i16, i16* [[REF2_AA]] +// CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF2_B]], i[[SZ]] 0, i[[SZ]] 2 +#endif Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -2230,8 +2230,42 @@ CGM.getOpenMPRuntime().emitInlinedDirective(*this, OMPD_atomic, CodeGen); } -void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &) { - llvm_unreachable("CodeGen for 'omp target' is not supported yet."); +void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &S) { + LexicalScope Scope(*this, S.getSourceRange()); + const CapturedStmt &CS = *cast<CapturedStmt>(S.getAssociatedStmt()); + + // Emit target region as a standalone region. + auto &&CodeGen = [&CS](CodeGenFunction &CGF) { + CGF.EmitStmt(CS.getCapturedStmt()); + }; + + // Obtain the target region outlined function. + llvm::Value *Fn = + CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, CodeGen); + + // Check if we have any if clause associated with the directive. + const Expr *IfCond = nullptr; + + if (auto *C = S.getSingleClause<OMPIfClause>()) { + IfCond = C->getCondition(); + } + + // Check if we have any device clause associated with the directive. 
+ const Expr *Device = nullptr; + if (auto *C = S.getSingleClause<OMPDeviceClause>()) { + Device = C->getDevice(); + } + + SmallVector<llvm::Value *, 8> VLASizesInit; + for (auto F : CS.getCapturedRecordDecl()->fields()) + if (F->hasCapturedVLAType()) { + auto *V = VLASizeMap[F->getCapturedVLAType()->getSizeExpr()]; + assert(V && "VLA size must exist!"); + VLASizesInit.push_back(V); + } + + CGM.getOpenMPRuntime().emitTargetCall(*this, S, Fn, IfCond, Device, + VLASizesInit); } void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &) { Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -154,6 +154,14 @@ // Call to kmp_int32 __kmpc_cancel(ident_t *loc, kmp_int32 global_tid, // kmp_int32 cncl_kind); OMPRTL__kmpc_cancel, + + // + // Offloading related calls + // + // Call to int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t + // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t + // *arg_types); + OMPRTL__tgt_target, }; /// \brief Values for bit flags used in the ident_t to describe the fields. @@ -177,6 +185,7 @@ /// \brief Implicit barrier in 'single' directive. OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140 }; + CodeGenModule &CGM; /// \brief Default const ident_t object used for initialization of all other /// ident_t objects. @@ -716,6 +725,28 @@ /// virtual void emitCancelCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind CancelRegion); + + /// \brief Emit outilined function for 'target' directive. + /// \param D Directive to emit. + /// \param CodeGen Code generation sequence for the \a D directive. + virtual llvm::Value * + emitTargetOutlinedFunction(const OMPExecutableDirective &D, + const RegionCodeGenTy &CodeGen); + + /// \brief Emit the target offloading code associated with \a D. The emitted + /// code attempts offloading the execution to the device, an the event of + /// a failure it executes the host version outlined in \a OutlinedFn. + /// \param D Directive to emit. + /// \param OutlinedFn Host version of the code to be offloaded. + /// \param IfCond Expression evaluated in if clause associated with the target + /// directive, or null if no if clause is used. + /// \param Device Expression evaluated in device clause associated with the + /// target directive, or null if no device clause is used. + virtual void emitTargetCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + llvm::Value *OutlinedFn, const Expr *IfCond, + const Expr *Device, + ArrayRef<llvm::Value *> VLASizesInit); }; } // namespace CodeGen Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -41,6 +41,8 @@ /// \brief Region for constructs that do not require function outlining, /// like 'for', 'sections', 'atomic' etc. directives. InlinedRegion, + /// \brief Region with outlined function for standalone 'target' directive. + TargetRegion, }; CGOpenMPRegionInfo(const CapturedStmt &CS, @@ -204,6 +206,28 @@ CGOpenMPRegionInfo *OuterRegionInfo; }; +/// \brief API for captured statement code generation in OpenMP target +/// constructs. For this captures, implicit parameters are used instead of the +/// captured fields. 
+class CGOpenMPTargetRegionInfo : public CGOpenMPRegionInfo { +public: + CGOpenMPTargetRegionInfo(const CapturedStmt &CS, + const RegionCodeGenTy &CodeGen) + : CGOpenMPRegionInfo(CS, TargetRegion, CodeGen, OMPD_target) {} + + /// \brief This is unused for target regions because each starts executing + /// with a single thread. + const VarDecl *getThreadIDVariable() const override { return nullptr; } + + /// \brief Get the name of the capture helper. + StringRef getHelperName() const override { return ".omp_offloading."; } + + static bool classof(const CGCapturedStmtInfo *Info) { + return CGOpenMPRegionInfo::classof(Info) && + cast<CGOpenMPRegionInfo>(Info)->getRegionKind() == TargetRegion; + } +}; + /// \brief RAII for emitting code of OpenMP constructs. class InlinedOpenMPRegionRAII { CodeGenFunction &CGF; @@ -861,6 +885,22 @@ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_cancel"); break; } + case OMPRTL__tgt_target: { + // Build int32_t __tgt_target(int32_t device_id, void *host_ptr, int32_t + // arg_num, void** args_base, void **args, size_t *arg_sizes, int32_t + // *arg_types); + llvm::Type *TypeParams[] = {CGM.Int32Ty, + CGM.VoidPtrTy, + CGM.Int32Ty, + CGM.VoidPtrPtrTy, + CGM.VoidPtrPtrTy, + CGM.SizeTy->getPointerTo(), + CGM.Int32Ty->getPointerTo()}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_target"); + break; + } } return RTLFn; } @@ -2921,3 +2961,394 @@ } } } + +/// \brief Emit a proxy function for target regions, which accepts each capture +/// as an argument and maps each argument to a new context to pass to the +/// main outlined function. +/// \code +/// void .omp_offloading_entry.(ty1 capture1, ..., tyn capture n) { +/// Context ctx; +/// // fillup the context with all the arguments. +/// .omp_offloading.(ctx); +/// return; +/// } +/// \endcode +static llvm::Value *emitProxyTargetFunction(CodeGenModule &CGM, + const CapturedStmt &CS, + SourceLocation Loc, + llvm::Value *TargetFunction) { + auto &C = CGM.getContext(); + + // Collect the arguments of the main function. + FunctionArgList Args; + auto *RDecl = CS.getCapturedRecordDecl(); + auto RI = RDecl->field_begin(); + for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(), + CE = CS.capture_end(); + CI != CE; ++CI, ++RI) { + StringRef Name; + QualType Ty; + if (CI->capturesVariableArrayType()) { + Ty = C.getPointerType(RI->getType()); + Name = "__vla_size"; + } else if (CI->capturesThis()) { + Ty = RI->getType(); + Name = "__this"; + } else { + const VarDecl *VD = CI->getCapturedVar(); + Ty = VD->getType(); + + // If this declaration has a variable-length array type, we use a + // pointer type to the element type of the VLA. The resulting IR type is + // the same, will avoid the trouble of emiting the VLAs, and we won't + // be dereferencing the the VLA in this function anyway. + + // Get the element type of the variable-length array type. + while (auto *VAT = dyn_cast<VariableArrayType>(Ty.getTypePtr())) + Ty = VAT->getElementType(); + Ty = C.getPointerType(Ty); + Name = VD->getName(); + } + + IdentifierInfo *ParamName = &C.Idents.get(Name); + ImplicitParamDecl *Param = + ImplicitParamDecl::Create(C, /*DC=*/nullptr, Loc, ParamName, Ty); + Args.push_back(Param); + } + + // Create the main function. 
+ FunctionType::ExtInfo Info; + auto &FnInfo = + CGM.getTypes().arrangeFreeFunctionDeclaration(C.VoidTy, Args, Info, + /*isVariadic=*/false); + auto *FnTy = CGM.getTypes().GetFunctionType(FnInfo); + auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage, + ".omp_offloading_entry.", &CGM.getModule()); + CGM.SetLLVMFunctionAttributes(/*D=*/nullptr, FnInfo, Fn); + CodeGenFunction CGF(CGM); + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args); + + // Create the context. + auto ContextType = C.getTagDeclType(RDecl); + auto Context = CGF.CreateMemTemp(ContextType, ".omp_offloading.context"); + + LValue ContextLV = + CGF.MakeNaturalAlignAddrLValue(Context.getPointer(), ContextType); + + auto AI = Args.begin(); + for (RecordDecl::field_iterator RI = RDecl->field_begin(), + RE = RDecl->field_end(); + RI != RE; ++RI, ++AI) { + + // Get the address of the field. + auto Dst = CGF.EmitLValueForFieldInitialization(ContextLV, *RI); + auto ArgRef = CGF.MakeNaturalAlignAddrLValue( + CGF.GetAddrOfLocalVar(*AI).getPointer(), (*AI)->getType()); + auto Arg = CGF.EmitLoadOfLValue(ArgRef, Loc); + if (RI->hasCapturedVLAType()) { + Address ArgAddr(Arg.getScalarVal(), C.getTypeAlignInChars(RI->getType())); + auto *Val = + CGF.EmitLoadOfScalar(ArgAddr, /*Volatile=*/false, RI->getType(), Loc); + CGF.EmitStoreOfScalar(Val, Dst); + continue; + } + CGF.EmitStoreOfScalar(Arg.getScalarVal(), Dst); + } + CGF.EmitCallOrInvoke(TargetFunction, Context.getPointer()); + CGF.FinishFunction(); + return Fn; +} + +llvm::Value * +CGOpenMPRuntime::emitTargetOutlinedFunction(const OMPExecutableDirective &D, + const RegionCodeGenTy &CodeGen) { + const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt()); + + CodeGenFunction CGF(CGM, true); + CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen); + CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); + auto *Fn = CGF.GenerateCapturedStmtFunction(CS); + Fn->addFnAttr(llvm::Attribute::AlwaysInline); + + return emitProxyTargetFunction(CGM, CS, D.getLocStart(), Fn); +} + +void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF, + const OMPExecutableDirective &D, + llvm::Value *OutlinedFn, + const Expr *IfCond, const Expr *Device, + ArrayRef<llvm::Value *> VLASizesInit) { + + /// \brief Values for bit flags used to specify the mapping type for + /// offloading. + enum OpenMPOffloadMappingFlags { + /// \brief Allocate memory on the device and move data from host to device. + OMP_MAP_TO = 0x01, + /// \brief Allocate memory on the device and move data from device to host. + OMP_MAP_FROM = 0x02, + }; + + enum OpenMPOffloadingReservedDeviceIDs { + /// \brief Device ID if the device was not defined, runtime should get it + /// from environment variables in the spec. + OMP_DEVICEID_UNDEF = -1, + }; + + // Fill up the arrays with the all the captured variables. 
+ SmallVector<llvm::Value *, 16> BasePointers; + SmallVector<llvm::Value *, 16> Pointers; + SmallVector<llvm::Value *, 16> Sizes; + SmallVector<unsigned, 16> MapTypes; + + bool hasVLACaptures = VLASizesInit.size(); + + const CapturedStmt &CS = *cast<CapturedStmt>(D.getAssociatedStmt()); + auto RI = CS.getCapturedRecordDecl()->field_begin(); + auto II = CS.capture_init_begin(); + auto VI = VLASizesInit.begin(); + for (CapturedStmt::const_capture_iterator CI = CS.capture_begin(), + CE = CS.capture_end(); + CI != CE; ++CI, ++RI, ++II) { + StringRef Name; + QualType Ty; + llvm::Value *BasePointer; + llvm::Value *Pointer; + llvm::Value *Size; + unsigned MapType; + + if (CI->capturesVariableArrayType()) { + LValue LV = CGF.MakeNaturalAlignAddrLValue( + CGF.CreateMemTemp(RI->getType(), "__vla_size").getPointer(), + RI->getType()); + CGF.EmitStoreThroughLValue(RValue::get(*VI), LV); + ++VI; + BasePointer = Pointer = LV.getAddress().getPointer(); + uint64_t SizeVal = + CGM.getContext().getTypeSizeInChars(RI->getType()).getQuantity(); + Size = llvm::ConstantInt::get(CGM.SizeTy, SizeVal); + + hasVLACaptures = true; + // VLA sizes don't need to be copied back from the device. + MapType = OMP_MAP_TO; + } else if (CI->capturesThis()) { + BasePointer = Pointer = CGF.LoadCXXThis(); + const PointerType *PtrTy = cast<PointerType>(RI->getType().getTypePtr()); + uint64_t SizeVal = CGM.getContext() + .getTypeSizeInChars(PtrTy->getPointeeType()) + .getQuantity(); + Size = llvm::ConstantInt::get(CGM.SizeTy, SizeVal); + + // Default map type. + MapType = OMP_MAP_TO | OMP_MAP_FROM; + } else { + BasePointer = Pointer = + CGF.EmitLValue(cast<DeclRefExpr>(*II)).getAddress().getPointer(); + + const ReferenceType *PtrTy = + cast<ReferenceType>(RI->getType().getTypePtr()); + QualType ElementType = PtrTy->getPointeeType(); + + if (auto *VAT = dyn_cast<VariableArrayType>(ElementType.getTypePtr())) { + auto VATInfo = CGF.getVLASize(VAT); + Size = llvm::ConstantInt::get( + CGM.SizeTy, + CGM.getContext().getTypeSizeInChars(VATInfo.second).getQuantity()); + Size = CGF.Builder.CreateNUWMul(Size, VATInfo.first); + } else { + uint64_t ElementTypeSize = + CGM.getContext().getTypeSizeInChars(ElementType).getQuantity(); + Size = llvm::ConstantInt::get(CGM.SizeTy, ElementTypeSize); + } + + // Default map type. + MapType = OMP_MAP_TO | OMP_MAP_FROM; + } + + BasePointers.push_back(BasePointer); + Pointers.push_back(Pointer); + Sizes.push_back(Size); + MapTypes.push_back(MapType); + } + + if (IfCond) { + // Check if the if clause conditional always evaluates to true or false. + // If it evaluates to false, we only need to emit the host version of the + // target region. If it evaluates to true, we can proceed with the codegen + // as if no if clause was provided. + bool CondConstant; + if (CGF.ConstantFoldsToSimpleInteger(IfCond, CondConstant)) { + if (CondConstant) { + IfCond = nullptr; + } else { + CGF.Builder.CreateCall(OutlinedFn, BasePointers); + return; + } + } + } + + // Generate the code to launch the target region. The pattern is the + // following: + // + // ... + // br IfCond (if any), omp_offload, omp_offload_fail + // + // omp_offload.try: + // ; create arrays for offloading + // error = __tgt_target(...) + // br error, omp_offload_fail, omp_offload_end + // + // omp_offload.fail: + // host_version(...) + // + // omp_offload.end: + // ... 
+ // + + auto OffloadTryBlock = CGF.createBasicBlock("omp_offload.try"); + auto OffloadFailBlock = CGF.createBasicBlock("omp_offload.fail"); + auto ContBlock = CGF.createBasicBlock("omp_offload.end"); + + if (IfCond) + CGF.EmitBranchOnBoolExpr(IfCond, OffloadTryBlock, OffloadFailBlock, + /*TrueCount=*/0); + + CGF.EmitBlock(OffloadTryBlock); + + unsigned PointerNumVal = BasePointers.size(); + llvm::Value *PointerNum = CGF.Builder.getInt32(PointerNumVal); + llvm::Value *BasePointersArray; + llvm::Value *PointersArray; + llvm::Value *SizesArray; + llvm::Value *MapTypesArray; + + if (PointerNumVal) { + llvm::APInt PointerNumAP(32, PointerNumVal, /*isSigned=*/true); + QualType PointerArrayType = CGF.getContext().getConstantArrayType( + CGF.getContext().VoidPtrTy, PointerNumAP, ArrayType::Normal, + /*IndexTypeQuals=*/0); + + BasePointersArray = + CGF.CreateMemTemp(PointerArrayType, ".offload_baseptrs").getPointer(); + PointersArray = + CGF.CreateMemTemp(PointerArrayType, ".offload_ptrs").getPointer(); + + // If we don't have any VLA types, we can use a constant array for the map + // sizes, otherwise we need to fill up the arrays as we do for the pointers. + if (hasVLACaptures) { + QualType SizeArrayType = CGF.getContext().getConstantArrayType( + CGF.getContext().getSizeType(), PointerNumAP, ArrayType::Normal, + /*IndexTypeQuals=*/0); + SizesArray = + CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer(); + } else { + // We expect all the sizes to be constant, so we collect them to create + // a constant array. + SmallVector<llvm::Constant *, 16> ConstSizes; + for (auto S : Sizes) + ConstSizes.push_back(cast<llvm::Constant>(S)); + + auto *SizesArrayInit = llvm::ConstantArray::get( + llvm::ArrayType::get(CGM.SizeTy, ConstSizes.size()), ConstSizes); + auto *SizesArrayGbl = new llvm::GlobalVariable( + CGM.getModule(), SizesArrayInit->getType(), + /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, + SizesArrayInit, ".offload_sizes"); + SizesArrayGbl->setUnnamedAddr(true); + SizesArray = SizesArrayGbl; + } + + // The map types are always constant so we don't need to generate code to + // fill arrays. Instead, we create an array constant. 
+ llvm::Constant *MapTypesArrayInit = + llvm::ConstantDataArray::get(CGF.Builder.getContext(), MapTypes); + auto *MapTypesArrayGbl = new llvm::GlobalVariable( + CGM.getModule(), MapTypesArrayInit->getType(), + /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, + MapTypesArrayInit, ".offload_maptypes"); + MapTypesArrayGbl->setUnnamedAddr(true); + MapTypesArray = MapTypesArrayGbl; + + for (unsigned i = 0; i < PointerNumVal; ++i) { + llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), BasePointersArray, + 0, i); + Address BPAddr( + BP, CGM.getContext().getTypeAlignInChars(CGM.getContext().VoidPtrTy)); + CGF.Builder.CreateStore( + CGF.Builder.CreateBitCast(BasePointers[i], CGM.VoidPtrTy), BPAddr); + + llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray, 0, + i); + Address PAddr( + P, CGM.getContext().getTypeAlignInChars(CGM.getContext().VoidPtrTy)); + CGF.Builder.CreateStore( + CGF.Builder.CreateBitCast(Pointers[i], CGM.VoidPtrTy), PAddr); + + if (hasVLACaptures) { + llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray, 0, i); + Address SAddr(S, CGM.getContext().getTypeAlignInChars( + CGM.getContext().getSizeType())); + CGF.Builder.CreateStore( + CGF.Builder.CreateIntCast(Sizes[i], CGM.SizeTy, /*isSigned=*/true), + SAddr); + } + } + + BasePointersArray = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), BasePointersArray, + 0, 0); + PointersArray = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray, 0, + 0); + SizesArray = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray, 0, 0); + MapTypesArray = CGF.Builder.CreateConstInBoundsGEP2_32( + llvm::ArrayType::get(CGM.Int32Ty, PointerNumVal), MapTypesArray, 0, 0); + + } else { + BasePointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy); + PointersArray = llvm::ConstantPointerNull::get(CGM.VoidPtrPtrTy); + SizesArray = llvm::ConstantPointerNull::get(CGM.SizeTy->getPointerTo()); + MapTypesArray = llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()); + } + + // On top of the arrays that were filled up, the target offloading call takes + // as arguments the device id as well as the host pointer. The host pointer + // is used by the runtime library to identify the current target region, so + // it only has to be unique and not necessarily point to anything. It could be + // the pointer to the outlined function that implements the target region, but + // we aren't using that so that the compiler doesn't need to keep that, and + // could therefore inline the host function if proven worthwhile during + // optimization. + + llvm::Value *HostPtr = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::PrivateLinkage, + llvm::Constant::getNullValue(CGM.Int8Ty), ".offload_hstptr"); + + // Emit device ID if any. 
+ llvm::Value *DeviceID; + if (Device) + DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device), + CGM.Int32Ty, /*isSigned=*/true); + else + DeviceID = CGF.Builder.getInt32(OMP_DEVICEID_UNDEF); + + llvm::Value *OffloadingArgs[] = {DeviceID, HostPtr, PointerNum, + BasePointersArray, PointersArray, SizesArray, + MapTypesArray}; + auto Return = CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__tgt_target), + OffloadingArgs); + auto Error = CGF.Builder.CreateIsNotNull(Return); + CGF.Builder.CreateCondBr(Error, OffloadFailBlock, ContBlock); + + CGF.EmitBlock(OffloadFailBlock); + CGF.Builder.CreateCall(OutlinedFn, BasePointers); + CGF.EmitBranch(ContBlock); + + CGF.EmitBlock(ContBlock, /*IsFinished=*/true); + return; +}
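For reviewers less familiar with the offloading runtime interface, the following is a rough sketch of the launch sequence emitTargetCall produces, rendered as plain C++ for a single scalar capture. It is only an illustration under made-up names (host_region_anchor and host_fallback stand in for the private global and the host-side entry point the codegen actually emits), and the stubbed __tgt_target simply simulates a failed offload so that the host fallback path is exercised.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stub with the same signature as the libomptarget entry point declared in
// this patch; returning a non-zero value models an offloading failure.
static std::int32_t __tgt_target(std::int32_t /*device_id*/, void * /*host_ptr*/,
                                 std::int32_t /*arg_num*/, void ** /*args_base*/,
                                 void ** /*args*/, std::size_t * /*arg_sizes*/,
                                 std::int32_t * /*arg_types*/) {
  return 1;
}

static char host_region_anchor;                // unique key for this target region
static void host_fallback(int *a) { *a += 1; } // host version of the region body

static void launch(int &a) {
  void *base[] = {&a};                  // .offload_baseptrs
  void *ptrs[] = {&a};                  // .offload_ptrs
  std::size_t sizes[] = {sizeof(int)};  // .offload_sizes
  std::int32_t types[] = {0x01 | 0x02}; // OMP_MAP_TO | OMP_MAP_FROM
  std::int32_t err = __tgt_target(/*device_id=*/-1, &host_region_anchor,
                                  /*arg_num=*/1, base, ptrs, sizes, types);
  if (err != 0)
    host_fallback(&a);                  // the omp_offload.fail path
}

int main() {
  int a = 0;
  launch(a);
  std::printf("a = %d\n", a); // prints 1: the host fallback executed
  return 0;
}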
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits