llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-flang-openmp Author: Thirumalai Shaktivel (Thirumalai-Shaktivel) <details> <summary>Changes</summary> Continuation from https://github.com/llvm/llvm-project/pull/104748 > From Documentation: \ Evaluation of transformational array intrinsic functions may be freely subdivided into any number of units of work. The transformational array intrinsic functions are MATMUL, DOT_PRODUCT, SUM, PRODUCT, MAXVAL, MINVAL, COUNT, ANY, ALL, SPREAD, PACK, UNPACK, RESHAPE, TRANSPOSE, EOSHIFT, CSHIFT, MINLOC, and MAXLOC. Using Ivan's patch (https://github.com/llvm/llvm-project/pull/104748), the intrinsic calls were enclosed within the single construct. So, I decided to pick one intrinsic and experiment with enclosing it in a wsloop node, which lets the work be shared among different threads. Here, I picked the SUM intrinsic and changed the lowering to generate a wsloop with the SUM operations as its body; see the test in the diff below for the generated MLIR. The same approach has to be applied to implement all the intrinsics mentioned in the documentation. 
--- Full diff: https://github.com/llvm/llvm-project/pull/113082.diff 6 Files Affected: - (modified) flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt (+1) - (modified) flang/lib/Optimizer/OpenMP/CMakeLists.txt (+1) - (modified) flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp (+116-35) - (modified) flang/lib/Optimizer/Passes/Pipelines.cpp (+2-2) - (modified) flang/test/Fir/basic-program.fir (+1-1) - (added) flang/test/Integration/OpenMP/workshare02.f90 (+36) ``````````diff diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt index fa3a59303137ff..43da1eb92d0306 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt @@ -26,6 +26,7 @@ add_flang_library(HLFIRTransforms FIRTransforms HLFIRDialect MLIRIR + FlangOpenMPTransforms ${dialect_libs} LINK_COMPONENTS diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 39e92d388288d4..776798d7239117 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -21,6 +21,7 @@ add_flang_library(FlangOpenMPTransforms FortranCommon MLIRFuncDialect MLIROpenMPDialect + MLIRArithDialect HLFIRDialect MLIRIR MLIRPass diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp index 225c585a02d913..9fb84f3e099c4c 100644 --- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp +++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp @@ -16,6 +16,7 @@ // //===----------------------------------------------------------------------===// +#include "flang/Optimizer/Builder/HLFIRTools.h" #include <flang/Optimizer/Builder/FIRBuilder.h> #include <flang/Optimizer/Dialect/FIROps.h> #include <flang/Optimizer/Dialect/FIRType.h> @@ -335,49 +336,129 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion, for (auto [i, opOrSingle] : llvm::enumerate(regions)) { bool isLast 
= i + 1 == regions.size(); if (std::holds_alternative<SingleRegion>(opOrSingle)) { - OpBuilder singleBuilder(sourceRegion.getContext()); - Block *singleBlock = new Block(); - singleBuilder.setInsertionPointToStart(singleBlock); - OpBuilder allocaBuilder(sourceRegion.getContext()); Block *allocaBlock = new Block(); allocaBuilder.setInsertionPointToStart(allocaBlock); - OpBuilder parallelBuilder(sourceRegion.getContext()); - Block *parallelBlock = new Block(); - parallelBuilder.setInsertionPointToStart(parallelBlock); - - auto [allParallelized, copyprivateVars] = - moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder, - singleBuilder, parallelBuilder); - if (allParallelized) { - // The single region was not required as all operations were safe to - // parallelize - assert(copyprivateVars.empty()); - assert(allocaBlock->empty()); - delete singleBlock; + it = block.begin(); + while (&*it != terminator) + if (isa<hlfir::SumOp>(it)) + break; + else + it++; + + if (auto sumOp = dyn_cast<hlfir::SumOp>(it)) { + /// Implementation: + /// Intrinsic function `SUM` operations + /// -- + /// x = sum(array) + /// + /// is converted to + /// + /// !$omp parallel do + /// do i = 1, size(array) + /// x = x + array(i) + /// end do + /// !$omp end parallel do + + OpBuilder wslBuilder(sourceRegion.getContext()); + Block *wslBlock = new Block(); + wslBuilder.setInsertionPointToStart(wslBlock); + + Value target = dyn_cast<hlfir::AssignOp>(++it).getLhs(); + Value array = sumOp.getArray(); + Value dim = sumOp.getDim(); + fir::SequenceType arrayTy = dyn_cast<fir::SequenceType>( + hlfir::getFortranElementOrSequenceType(array.getType())); + llvm::ArrayRef<int64_t> arrayShape = arrayTy.getShape(); + if (arrayShape.size() == 1 && !dim) { + Value itr = allocaBuilder.create<fir::AllocaOp>( + loc, allocaBuilder.getI64Type()); + Value c_one = allocaBuilder.create<arith::ConstantOp>( + loc, allocaBuilder.getI64IntegerAttr(1)); + Value c_arr_size = allocaBuilder.create<arith::ConstantOp>( + 
loc, allocaBuilder.getI64IntegerAttr(arrayShape[0])); + // Value c_zero = allocaBuilder.create<arith::ConstantOp>(loc, + // allocaBuilder.getZeroAttr(arrayTy.getEleTy())); + // allocaBuilder.create<fir::StoreOp>(loc, c_zero, target); + + omp::WsloopOperands wslOps; + omp::WsloopOp wslOp = + rootBuilder.create<omp::WsloopOp>(loc, wslOps); + + hlfir::LoopNest ln; + ln.outerOp = wslOp; + omp::LoopNestOperands lnOps; + lnOps.loopLowerBounds.push_back(c_one); + lnOps.loopUpperBounds.push_back(c_arr_size); + lnOps.loopSteps.push_back(c_one); + lnOps.loopInclusive = wslBuilder.getUnitAttr(); + omp::LoopNestOp lnOp = + wslBuilder.create<omp::LoopNestOp>(loc, lnOps); + Block *lnBlock = wslBuilder.createBlock(&lnOp.getRegion()); + lnBlock->addArgument(c_one.getType(), loc); + wslBuilder.create<fir::StoreOp>( + loc, lnOp.getRegion().getArgument(0), itr); + Value tarLoad = wslBuilder.create<fir::LoadOp>(loc, target); + Value itrLoad = wslBuilder.create<fir::LoadOp>(loc, itr); + hlfir::DesignateOp arrDesOp = wslBuilder.create<hlfir::DesignateOp>( + loc, fir::ReferenceType::get(arrayTy.getEleTy()), array, + itrLoad); + Value desLoad = wslBuilder.create<fir::LoadOp>(loc, arrDesOp); + Value addf = + wslBuilder.create<arith::AddFOp>(loc, tarLoad, desLoad); + wslBuilder.create<fir::StoreOp>(loc, addf, target); + wslBuilder.create<omp::YieldOp>(loc); + ln.body = lnBlock; + wslOp.getRegion().push_back(wslBlock); + targetRegion.front().getOperations().splice( + wslOp->getIterator(), allocaBlock->getOperations()); + } else { + emitError(loc, "Only 1D array scalar assignment for sum " + "instrinsic is supported in workshare construct"); + return; + } } else { - omp::SingleOperands singleOperands; - if (isLast) - singleOperands.nowait = rootBuilder.getUnitAttr(); - singleOperands.copyprivateVars = copyprivateVars; - cleanupBlock(singleBlock); - for (auto var : singleOperands.copyprivateVars) { - mlir::func::FuncOp funcOp = - createCopyFunc(loc, var.getType(), firCopyFuncBuilder); - 
singleOperands.copyprivateSyms.push_back( - SymbolRefAttr::get(funcOp)); + OpBuilder singleBuilder(sourceRegion.getContext()); + Block *singleBlock = new Block(); + singleBuilder.setInsertionPointToStart(singleBlock); + + OpBuilder parallelBuilder(sourceRegion.getContext()); + Block *parallelBlock = new Block(); + parallelBuilder.setInsertionPointToStart(parallelBlock); + + auto [allParallelized, copyprivateVars] = + moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder, + singleBuilder, parallelBuilder); + if (allParallelized) { + // The single region was not required as all operations were safe to + // parallelize + assert(copyprivateVars.empty()); + assert(allocaBlock->empty()); + delete singleBlock; + } else { + omp::SingleOperands singleOperands; + if (isLast) + singleOperands.nowait = rootBuilder.getUnitAttr(); + singleOperands.copyprivateVars = copyprivateVars; + cleanupBlock(singleBlock); + for (auto var : singleOperands.copyprivateVars) { + mlir::func::FuncOp funcOp = + createCopyFunc(loc, var.getType(), firCopyFuncBuilder); + singleOperands.copyprivateSyms.push_back( + SymbolRefAttr::get(funcOp)); + } + omp::SingleOp singleOp = + rootBuilder.create<omp::SingleOp>(loc, singleOperands); + singleOp.getRegion().push_back(singleBlock); + targetRegion.front().getOperations().splice( + singleOp->getIterator(), allocaBlock->getOperations()); } - omp::SingleOp singleOp = - rootBuilder.create<omp::SingleOp>(loc, singleOperands); - singleOp.getRegion().push_back(singleBlock); - targetRegion.front().getOperations().splice( - singleOp->getIterator(), allocaBlock->getOperations()); + rootBuilder.getInsertionBlock()->getOperations().splice( + rootBuilder.getInsertionPoint(), parallelBlock->getOperations()); + delete parallelBlock; } - rootBuilder.getInsertionBlock()->getOperations().splice( - rootBuilder.getInsertionPoint(), parallelBlock->getOperations()); delete allocaBlock; - delete parallelBlock; } else { auto op = std::get<Operation *>(opOrSingle); if 
(auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) { diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index c1a5902b747887..3a9166f2e0aa52 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -227,11 +227,11 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP, hlfir::createOptimizedBufferization); } pm.addPass(hlfir::createLowerHLFIROrderedAssignments()); + if (enableOpenMP) + pm.addPass(flangomp::createLowerWorkshare()); pm.addPass(hlfir::createLowerHLFIRIntrinsics()); pm.addPass(hlfir::createBufferizeHLFIR()); pm.addPass(hlfir::createConvertHLFIRtoFIR()); - if (enableOpenMP) - pm.addPass(flangomp::createLowerWorkshare()); } /// Create a pass pipeline for handling certain OpenMP transformations needed diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 4b18acb7c2b430..9b9651a476e583 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -44,10 +44,10 @@ func.func @_QQmain() { // PASSES-NEXT: 'omp.private' Pipeline // PASSES-NEXT: OptimizedBufferization // PASSES-NEXT: LowerHLFIROrderedAssignments +// PASSES-NEXT: LowerWorkshare // PASSES-NEXT: LowerHLFIRIntrinsics // PASSES-NEXT: BufferizeHLFIR // PASSES-NEXT: ConvertHLFIRtoFIR -// PASSES-NEXT: LowerWorkshare // PASSES-NEXT: CSE // PASSES-NEXT: (S) 0 num-cse'd - Number of operations CSE'd // PASSES-NEXT: (S) 0 num-dce'd - Number of operations DCE'd diff --git a/flang/test/Integration/OpenMP/workshare02.f90 b/flang/test/Integration/OpenMP/workshare02.f90 new file mode 100644 index 00000000000000..68b810a32f247e --- /dev/null +++ b/flang/test/Integration/OpenMP/workshare02.f90 @@ -0,0 +1,36 @@ +!===----------------------------------------------------------------------===! +! This directory can be used to add Integration tests involving multiple +! stages of the compiler (for eg. from Fortran to LLVM IR). It should not +! 
contain executable tests. We should only add tests here sparingly and only +! if there is no other way to test. Repeat this message in each test that is +! added to this directory and sub-directories. +!===----------------------------------------------------------------------===! + +!RUN: %flang_fc1 -emit-mlir -fopenmp -O3 %s -o - | FileCheck %s --check-prefix MLIR + +program test_ws_01 + implicit none + real(8) :: arr_01(10), x + arr_01 = [0.347,0.892,0.573,0.126,0.788,0.412,0.964,0.205,0.631,0.746] + + !$omp parallel workshare + x = sum(arr_01) + !$omp end parallel workshare +end program test_ws_01 + +! MLIR: func.func @_QQmain +! MLIR: omp.parallel { +! [...] +! MLIR: omp.wsloop { +! MLIR: omp.loop_nest {{.*}} +! [...] +! MLIR: %[[SUM:.*]] = arith.addf {{.*}} +! [...] +! MLIR: omp.yield +! MLIR: } +! MLIR: } +! MLIR: omp.barrier +! MLIR: omp.terminator +! MLIR: } +! MLIR: return +! MLIR: } `````````` </details> https://github.com/llvm/llvm-project/pull/113082 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits