AlexVlx created this revision. AlexVlx added reviewers: yaxunl, rjmccall, eli.friedman, arsenm, tra, jlebar. AlexVlx added a project: clang. Herald added a subscriber: ormris. Herald added a project: All. AlexVlx requested review of this revision. Herald added subscribers: cfe-commits, wdng.
This patch adds the CodeGen changes needed by the standard algorithm offload feature being proposed here: https://discourse.llvm.org/t/rfc-adding-c-parallel-algorithm-offload-support-to-clang-llvm/72159/1. The verbose documentation is included in the head of the patch series. This change concludes the set of additions needed in Clang, and essentially relaxes restrictions on what gets emitted on the device path, when compiling in `stdpar` mode (after the previous patch relaxed restrictions on what is semantically correct): 1. Unless a function is explicitly marked `__host__`, it will get emitted, whereas before only `__device__` and `__global__` functions would be emitted; - At the moment we special case `thread_local` handling and still do not emit them, as they will require more scaffolding that will be proposed at some point in the future. 2. Unsupported builtins are ignored as opposed to being marked as an error, as the decision on their validity is deferred to the `stdpar` specific code selection pass we are adding, which will be the topic of the final patch in this series; 3. We add the `stdpar` specific passes to the `opt` pipeline, independent of optimisation level: - When compiling for the accelerator / offload device, we add a code selection pass; - When compiling for the host, iff the user requested it via the `--stdpar-interpose-alloc` flag, we add a pass which replaces canonical allocation / deallocation functions with accelerator aware equivalents. A test to validate that unannotated functions get correctly emitted is added as well. Please note that `__device__`, `__global__` and `__host__` are used to match existing nomenclature, they would not be present in user code. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D155850 Files: clang/lib/CodeGen/BackendUtil.cpp clang/lib/CodeGen/CGBuiltin.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s + +// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=STDPAR-DEV %s + +#define __device__ __attribute__((device)) + +// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}}) +// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}}) +void foo(float *a, float b) { + *a = b; +} + +// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}}) +// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}}) +__device__ void bar(float *a, float b) { + *a = b; +} \ No newline at end of file Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -3545,7 +3545,12 @@ !Global->hasAttr<CUDAConstantAttr>() && !Global->hasAttr<CUDASharedAttr>() && !Global->getType()->isCUDADeviceBuiltinSurfaceType() && - !Global->getType()->isCUDADeviceBuiltinTextureType()) + !Global->getType()->isCUDADeviceBuiltinTextureType() && + !(LangOpts.HIPStdPar && + isa<FunctionDecl>(Global) && + !cast<FunctionDecl>(Global)->getBuiltinID() && + !Global->hasAttr<CUDAHostAttr>() && + !cast<FunctionDecl>(Global)->isVariadic())) return; } else { // We need to emit host-side 'shadows' for all global @@ -5310,7 +5315,9 @@ setNonAliasAttributes(D, GV); - if (D->getTLSKind() && !GV->isThreadLocal()) { + if (D->getTLSKind() && + !GV->isThreadLocal() && + !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) { if (D->getTLSKind() == VarDecl::TLS_Dynamic) CXXThreadLocals.push_back(D); setTLSMode(GV, *D); Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -5538,7 +5538,8 @@ llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); } - ErrorUnsupported(E, "builtin function"); + if (!getLangOpts().HIPStdPar) + ErrorUnsupported(E, "builtin function"); // Unknown builtin, for now just dump it out and return undef. return GetUndefRValue(E->getType()); Index: clang/lib/CodeGen/BackendUtil.cpp =================================================================== --- clang/lib/CodeGen/BackendUtil.cpp +++ clang/lib/CodeGen/BackendUtil.cpp @@ -77,6 +77,7 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/JumpThreading.h" +#include "llvm/Transforms/StdPar/StdPar.h" #include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -1093,6 +1094,13 @@ TheModule->addModuleFlag(Module::Error, "UnifiedLTO", uint32_t(1)); } + if (LangOpts.HIPStdPar) { + if (LangOpts.CUDAIsDevice) + MPM.addPass(StdParAcceleratorCodeSelectionPass()); + else if (LangOpts.HIPStdParInterposeAlloc) + MPM.addPass(StdParAllocationInterpositionPass()); + } + // Now that we have all of the passes ready, run them. { PrettyStackTraceString CrashInfo("Optimizer");
Index: clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp =================================================================== --- /dev/null +++ clang/test/CodeGenStdPar/unannotated-functions-get-emitted.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -x hip -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=NO-STDPAR-DEV %s + +// RUN: %clang_cc1 --stdpar -emit-llvm -fcuda-is-device \ +// RUN: -o - %s | FileCheck --check-prefix=STDPAR-DEV %s + +#define __device__ __attribute__((device)) + +// NO-STDPAR-DEV-NOT: define {{.*}} void @_Z3fooPff({{.*}}) +// STDPAR-DEV: define {{.*}} void @_Z3fooPff({{.*}}) +void foo(float *a, float b) { + *a = b; +} + +// NO-STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}}) +// STDPAR-DEV: define {{.*}} void @_Z3barPff({{.*}}) +__device__ void bar(float *a, float b) { + *a = b; +} \ No newline at end of file Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -3545,7 +3545,12 @@ !Global->hasAttr<CUDAConstantAttr>() && !Global->hasAttr<CUDASharedAttr>() && !Global->getType()->isCUDADeviceBuiltinSurfaceType() && - !Global->getType()->isCUDADeviceBuiltinTextureType()) + !Global->getType()->isCUDADeviceBuiltinTextureType() && + !(LangOpts.HIPStdPar && + isa<FunctionDecl>(Global) && + !cast<FunctionDecl>(Global)->getBuiltinID() && + !Global->hasAttr<CUDAHostAttr>() && + !cast<FunctionDecl>(Global)->isVariadic())) return; } else { // We need to emit host-side 'shadows' for all global @@ -5310,7 +5315,9 @@ setNonAliasAttributes(D, GV); - if (D->getTLSKind() && !GV->isThreadLocal()) { + if (D->getTLSKind() && + !GV->isThreadLocal() && + !(getLangOpts().HIPStdPar && getLangOpts().CUDAIsDevice)) { if (D->getTLSKind() == VarDecl::TLS_Dynamic) CXXThreadLocals.push_back(D); setTLSMode(GV, *D); Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -5538,7 +5538,8 @@ llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); } - ErrorUnsupported(E, "builtin function"); + if (!getLangOpts().HIPStdPar) + ErrorUnsupported(E, "builtin function"); // Unknown builtin, for now just dump it out and return undef. return GetUndefRValue(E->getType()); Index: clang/lib/CodeGen/BackendUtil.cpp =================================================================== --- clang/lib/CodeGen/BackendUtil.cpp +++ clang/lib/CodeGen/BackendUtil.cpp @@ -77,6 +77,7 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Scalar/JumpThreading.h" +#include "llvm/Transforms/StdPar/StdPar.h" #include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/ModuleUtils.h" @@ -1093,6 +1094,13 @@ TheModule->addModuleFlag(Module::Error, "UnifiedLTO", uint32_t(1)); } + if (LangOpts.HIPStdPar) { + if (LangOpts.CUDAIsDevice) + MPM.addPass(StdParAcceleratorCodeSelectionPass()); + else if (LangOpts.HIPStdParInterposeAlloc) + MPM.addPass(StdParAllocationInterpositionPass()); + } + // Now that we have all of the passes ready, run them. { PrettyStackTraceString CrashInfo("Optimizer");
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits