[PATCH] D154568: [Clang][OpenMP] GPU simd directive code generation

2023-07-24 Thread Eric Wright via Phabricator via cfe-commits
efwright updated this revision to Diff 543540.
efwright added a comment.

Dropping off a simple test case. If this looks like what you would expect for
the tests, I have a couple of more involved ones that I can repurpose and add.
For more complex tests, we have a couple of the benchmark codes from ICPP that
were working.

Some cleanup of the code generation is coming. I will be traveling tomorrow, so
it might take a day or two.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D154568/new/

https://reviews.llvm.org/D154568

Files:
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/Parse/ParseOpenMP.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
  openmp/libomptarget/DeviceRTL/src/Workshare.cpp
  openmp/libomptarget/test/offloading/simd.c

Index: openmp/libomptarget/test/offloading/simd.c
===
--- /dev/null
+++ openmp/libomptarget/test/offloading/simd.c
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int main() { // Offload smoke test: fill A on the device through a simd loop, then verify it on the host.
+
+  int *A = (int*) malloc(32*sizeof(int)); // 32-int buffer; its contents are not initialized before mapping.
+
+  #pragma omp target teams map(tofrom:A[0:32]) num_teams(1) // One team requested; A[0:32] is mapped tofrom.
+  {
+#pragma omp parallel num_threads(32) // Requests 32 threads for the parallel region.
+{
+  #pragma omp simd // The directive under test; the loop below stores 1 into each of the 32 elements.
+  for(int i = 0; i < 32; i++)
+A[i] = 1;
+}
+  }
+
+  for(int i = 0; i < 32; i++) // Host-side check: every element must equal 1.
+assert(A[i] == 1);
+
+  free(A);
+
+  printf("PASS\n"); // Matched by the CHECK line that follows this function.
+}
+// CHECK: PASS
+
Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp
===
--- openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -658,6 +658,19 @@
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
   FunctionTracingRAII();
 }
+
+void __kmpc_simd_51( // Device runtime entry for the simd loop: invokes WorkFn once per iteration index.
+  IdentTy *ident, void *WorkFn, uint64_t TripCount, // WorkFn: function pointer passed as void*; TripCount: number of iterations.
+  void **Args, uint32_t nargs // Args is forwarded unchanged to WorkFn; ident and nargs are not read in this body.
+) {
+  FunctionTracingRAII();
+
+  ASSERT(WorkFn); // WorkFn is cast and called below, so it must be non-null.
+  for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) { // Plain sequential loop over 0 .. TripCount-1.
+((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args); // WorkFn is called as void(uint64_t, void**).
+  }
+  
+}
 }
 
 #pragma omp end declare target
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -454,6 +454,7 @@
   for (OutlineInfo &OI : OutlineInfos) {
 // Skip functions that have not finalized yet; may happen with nested
 // function generation.
+
 if (Fn && OI.getFunction() != Fn) {
   DeferredOutlines.push_back(OI);
   continue;
@@ -462,7 +463,6 @@
 ParallelRegionBlockSet.clear();
 Blocks.clear();
 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
-
 Function *OuterFn = OI.getFunction();
 CodeExtractorAnalysisCache CEAC(*OuterFn);
 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
@@ -481,11 +481,10 @@
 assert(Extractor.isEligible() &&
"Expected OpenMP outlining to be possible!");
 
-for (auto *V : OI.ExcludeArgsFromAggregate)
+for (auto *V : OI.ExcludeArgsFromAggregate) {
   Extractor.excludeArgFromAggregate(V);
-
+}
 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
-
 LLVM_DEBUG(dbgs() << "After  outlining: " << *OuterFn << "\n");
 LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
 assert(OutlinedFn->getReturnType()->isVoidTy() &&
@@ -1230,6 +1229,308 @@
   return AfterIP;
 }
 
+
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+  const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+  LoopBodyCallbackTy BodyGenCB,
+  TripCountCallbackTy DistanceCB,
+  PrivatizeCallbackTy PrivCB,
+  FinalizeCallbackTy FiniCB,
+  bool SPMDMode
+)
+{
+  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+  if (!updateToLocation(Loc))
+return Loc.IP;
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+  Value *ThreadID = getOrCreateThreadID(Ident);
+
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *OuterFn = InsertBB->getParent();
+
+  LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n");
+
+  // Save the outer alloca block because the insertion iterator may get
+  // invalidated and we still need this later.
+  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+  // Vector to remember instructions we used only during the modeling but which
+  // we want to delete at the end.
+  SmallVector ToBeDeleted;
+
+  // Create an artificial insertion point that will also ensure the blocks we
+  // are about to split are not degenerated.
+  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI, *ElseTI = nullptr;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+  BasicBlock *LRegDistanceBB = ThenBB->split

[PATCH] D154568: OpenMP GPU simd directive code generation

2023-07-05 Thread Eric Wright via Phabricator via cfe-commits
efwright created this revision.
efwright added a reviewer: jdoerfert.
Herald added subscribers: sunshaoce, guansong, hiraditya, yaxunl.
Herald added a project: All.
efwright requested review of this revision.
Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, jplehr, 
sstefan1.
Herald added projects: clang, OpenMP, LLVM.

This is a portion of the work for implementing OpenMP's "simd" loop directive
for GPUs. For now, we are only upstreaming the code generation portion.
Right now, the runtime just runs the loop sequentially.

Style-wise, we're using a methodology similar to that of other directives in
libomptarget, where the "parallel region" is outlined and passed as an argument
to the appropriate runtime function. These changes are in the OMPIRBuilder
and are currently only enabled if the OMPIRBuilder is enabled. The code also
depends on the OMPCanonicalLoop class existing in the AST, which currently only
happens when the OMPIRBuilder is enabled.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D154568

Files:
  clang/lib/CodeGen/CGStmtOpenMP.cpp
  clang/lib/CodeGen/CodeGenFunction.cpp
  clang/lib/Parse/ParseOpenMP.cpp
  llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
  llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
  openmp/libomptarget/DeviceRTL/src/Workshare.cpp

Index: openmp/libomptarget/DeviceRTL/src/Workshare.cpp
===
--- openmp/libomptarget/DeviceRTL/src/Workshare.cpp
+++ openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -658,6 +658,19 @@
 void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {
   FunctionTracingRAII();
 }
+
+void __kmpc_simd_51( // Device runtime entry for the simd loop: invokes WorkFn once per iteration index.
+  IdentTy *ident, void *WorkFn, uint64_t TripCount, // WorkFn: function pointer passed as void*; TripCount: number of iterations.
+  void **Args, uint32_t nargs // Args is forwarded unchanged to WorkFn; ident and nargs are not read in this body.
+) {
+  FunctionTracingRAII();
+
+  ASSERT(WorkFn); // WorkFn is cast and called below, so it must be non-null.
+  for(uint64_t omp_iv = 0; omp_iv < TripCount; omp_iv++) { // Plain sequential loop over 0 .. TripCount-1.
+((void (*)(uint64_t, void**))WorkFn)(omp_iv, Args); // WorkFn is called as void(uint64_t, void**).
+  }
+  
+}
 }
 
 #pragma omp end declare target
Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
===
--- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -454,6 +454,7 @@
   for (OutlineInfo &OI : OutlineInfos) {
 // Skip functions that have not finalized yet; may happen with nested
 // function generation.
+
 if (Fn && OI.getFunction() != Fn) {
   DeferredOutlines.push_back(OI);
   continue;
@@ -462,7 +463,6 @@
 ParallelRegionBlockSet.clear();
 Blocks.clear();
 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
-
 Function *OuterFn = OI.getFunction();
 CodeExtractorAnalysisCache CEAC(*OuterFn);
 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
@@ -481,11 +481,10 @@
 assert(Extractor.isEligible() &&
"Expected OpenMP outlining to be possible!");
 
-for (auto *V : OI.ExcludeArgsFromAggregate)
+for (auto *V : OI.ExcludeArgsFromAggregate) {
   Extractor.excludeArgFromAggregate(V);
-
+}
 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
-
 LLVM_DEBUG(dbgs() << "After  outlining: " << *OuterFn << "\n");
 LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
 assert(OutlinedFn->getReturnType()->isVoidTy() &&
@@ -1230,6 +1229,308 @@
   return AfterIP;
 }
 
+
+IRBuilder<>::InsertPoint OpenMPIRBuilder::createSimdLoop(
+  const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
+  LoopBodyCallbackTy BodyGenCB,
+  TripCountCallbackTy DistanceCB,
+  PrivatizeCallbackTy PrivCB,
+  FinalizeCallbackTy FiniCB,
+  bool SPMDMode
+)
+{
+  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
+  if (!updateToLocation(Loc))
+return Loc.IP;
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+  Value *ThreadID = getOrCreateThreadID(Ident);
+
+  BasicBlock *InsertBB = Builder.GetInsertBlock();
+  Function *OuterFn = InsertBB->getParent();
+
+  LLVM_DEBUG(dbgs() << "At the start of createSimdLoop: " << *OuterFn << "\n");
+
+  // Save the outer alloca block because the insertion iterator may get
+  // invalidated and we still need this later.
+  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
+
+  // Vector to remember instructions we used only during the modeling but which
+  // we want to delete at the end.
+  SmallVector ToBeDeleted;
+
+  // Create an artificial insertion point that will also ensure the blocks we
+  // are about to split are not degenerated.
+  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
+
+  Instruction *ThenTI = UI, *ElseTI = nullptr;
+
+  BasicBlock *ThenBB = ThenTI->getParent();
+  BasicBlock *LRegDistanceBB = ThenBB->splitBasicBlock(ThenTI, "omp.loop.distance");
+  BasicBlock *PRegEntryBB = LRegDistanceBB->splitBasicBlock(ThenTI, "omp.loop.entry");
+