[PATCH] D154568: [Clang][OpenMP] GPU simd directive code generation
efwright updated this revision to Diff 543540. efwright added a comment. Dropping off a simple test case. If this looks like what you would expect for the tests, I have a couple of more involved ones that I can repurpose and add. For more complex tests, we have a couple of the benchmark codes from ICPP that were working. Some cleanup of the code generation is coming; I will be traveling tomorrow, so it might take a day or two. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D154568/new/ https://reviews.llvm.org/D154568 Files: clang/lib/CodeGen/CGStmtOpenMP.cpp clang/lib/CodeGen/CodeGenFunction.cpp clang/lib/Parse/ParseOpenMP.cpp llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h llvm/include/llvm/Frontend/OpenMP/OMPKinds.def llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp openmp/libomptarget/DeviceRTL/src/Workshare.cpp openmp/libomptarget/test/offloading/simd.c Index: openmp/libomptarget/test/offloading/simd.c === --- /dev/null +++ openmp/libomptarget/test/offloading/simd.c @@ -0,0 +1,27 @@ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +int main() { + + int *A = (int*) malloc(32*sizeof(int)); + + #pragma omp target teams map(tofrom:A[0:32]) num_teams(1) + { +#pragma omp parallel num_threads(32) +{ + #pragma omp simd + for(int i = 0; i < 32; i++) +A[i] = 1; +} + } + + for(int i = 0; i < 32; i++) +assert(A[i] == 1); + + free(A); + + printf("PASS\n"); +} +// CHECK: PASS + Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp === --- openmp/libomptarget/DeviceRTL/src/Workshare.cpp +++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -658,6 +658,19 @@ void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) { FunctionTracingRAII(); } + +void __kmpc_simd_51( + IdentTy *ident, void *WorkFn, uint64_t TripCount, + void **Args, uint32_t nargs +) { + FunctionTracingRAII(); + + ASSERT(WorkFn); + for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) { +((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args); + } + +} } #pragma omp end declare target Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp 
=== --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -454,6 +454,7 @@ for (OutlineInfo &OI : OutlineInfos) { // Skip functions that have not finalized yet; may happen with nested // function generation. + if (Fn && OI.getFunction() != Fn) { DeferredOutlines.push_back(OI); continue; @@ -462,7 +463,6 @@ ParallelRegionBlockSet.clear(); Blocks.clear(); OI.collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, @@ -481,11 +481,10 @@ assert(Extractor.isEligible() && "Expected OpenMP outlining to be possible!"); -for (auto *V : OI.ExcludeArgsFromAggregate) +for (auto *V : OI.ExcludeArgsFromAggregate) { Extractor.excludeArgFromAggregate(V); - +} Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); - LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n"); LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n"); assert(OutlinedFn->getReturnType()->isVoidTy() && @@ -1230,6 +1229,308 @@ return AfterIP; } + +IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( + const LocationDescription &Loc, InsertPointTy OuterAllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, + bool SPMDMode +) +{ + assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + + if (!updateToLocation(Loc)) +return Loc.IP; + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = getOrCreateThreadID(Ident); + + BasicBlock *InsertBB = Builder.GetInsertBlock(); + Function *OuterFn = InsertBB->getParent(); + + LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n"); + + // Save the outer alloca block because the insertion iterator may get + // invalidated and we still 
need this later. + BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + + // Vector to remember instructions we used only during the modeling but which + // we want to delete at the end. + SmallVector ToBeDeleted; + + // Create an artificial insertion point that will also ensure the blocks we + // are about to split are not degenerated. + auto *UI = new UnreachableInst(Builder.getContext(), InsertBB); + + Instruction *ThenTI = UI, *ElseTI = nullptr; + + BasicBlock *ThenBB = ThenTI->getParent(); + BasicBlock *LRegDistanceBB = ThenBB->split
[PATCH] D154568: OpenMP GPU simd directive code generation
efwright created this revision. efwright added a reviewer: jdoerfert. Herald added subscribers: sunshaoce, guansong, hiraditya, yaxunl. Herald added a project: All. efwright requested review of this revision. Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, jplehr, sstefan1. Herald added projects: clang, OpenMP, LLVM. This is a portion of the work for implementing OpenMP's "simd" loop directive for GPUs. For now, we are only working on upstreaming the code generation portion. Right now the runtime just runs the loop sequentially. Style-wise, we're using a methodology similar to that of other directives in libomptarget, where the "parallel region" is outlined and passed as an argument to the appropriate runtime function. These changes are in the OMPIRBuilder and right now are only enabled if the OMPIRBuilder is enabled. The code also depends on the OMPCanonicalLoop class existing in the AST, which currently only happens when the OMPIRBuilder is enabled. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D154568 Files: clang/lib/CodeGen/CGStmtOpenMP.cpp clang/lib/CodeGen/CodeGenFunction.cpp clang/lib/Parse/ParseOpenMP.cpp llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h llvm/include/llvm/Frontend/OpenMP/OMPKinds.def llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp openmp/libomptarget/DeviceRTL/src/Workshare.cpp Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp === --- openmp/libomptarget/DeviceRTL/src/Workshare.cpp +++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp @@ -658,6 +658,19 @@ void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) { FunctionTracingRAII(); } + +void __kmpc_simd_51( + IdentTy *ident, void *WorkFn, uint64_t TripCount, + void **Args, uint32_t nargs +) { + FunctionTracingRAII(); + + ASSERT(WorkFn); + for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) { +((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args); + } + +} } #pragma omp end declare target Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp === --- 
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -454,6 +454,7 @@ for (OutlineInfo &OI : OutlineInfos) { // Skip functions that have not finalized yet; may happen with nested // function generation. + if (Fn && OI.getFunction() != Fn) { DeferredOutlines.push_back(OI); continue; @@ -462,7 +463,6 @@ ParallelRegionBlockSet.clear(); Blocks.clear(); OI.collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, @@ -481,11 +481,10 @@ assert(Extractor.isEligible() && "Expected OpenMP outlining to be possible!"); -for (auto *V : OI.ExcludeArgsFromAggregate) +for (auto *V : OI.ExcludeArgsFromAggregate) { Extractor.excludeArgFromAggregate(V); - +} Function *OutlinedFn = Extractor.extractCodeRegion(CEAC); - LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n"); LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n"); assert(OutlinedFn->getReturnType()->isVoidTy() && @@ -1230,6 +1229,308 @@ return AfterIP; } + +IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop( + const LocationDescription &Loc, InsertPointTy OuterAllocaIP, + LoopBodyCallbackTy BodyGenCB, + TripCountCallbackTy DistanceCB, + PrivatizeCallbackTy PrivCB, + FinalizeCallbackTy FiniCB, + bool SPMDMode +) +{ + assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous"); + + if (!updateToLocation(Loc)) +return Loc.IP; + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + Value *ThreadID = getOrCreateThreadID(Ident); + + BasicBlock *InsertBB = Builder.GetInsertBlock(); + Function *OuterFn = InsertBB->getParent(); + + LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n"); + + // Save the outer alloca block because the insertion iterator may get + // invalidated and we still need this 
later. + BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock(); + + // Vector to remember instructions we used only during the modeling but which + // we want to delete at the end. + SmallVector ToBeDeleted; + + // Create an artificial insertion point that will also ensure the blocks we + // are about to split are not degenerated. + auto *UI = new UnreachableInst(Builder.getContext(), InsertBB); + + Instruction *ThenTI = UI, *ElseTI = nullptr; + + BasicBlock *ThenBB = ThenTI->getParent(); + BasicBlock *LRegDistanceBB = ThenBB->splitBasicBlock(ThenTI, "omp.loop.distance"); + BasicBlock *PRegEntryBB = LRegDistanceBB->splitBasicBlock(ThenTI, "omp.loop.entry"); +