[llvm-branch-commits] [llvm] [NVPTX] add support for encoding PTX registers for DWARF (PR #109495)

2024-09-22 Thread Walter Erquinigo via llvm-branch-commits


@@ -141,3 +142,47 @@ NVPTXRegisterInfo::getFrameLocalRegister(const 
MachineFunction &MF) const {
   static_cast(MF.getTarget());
   return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
 }
+
+void NVPTXRegisterInfo::clearDebugRegisterMap() const {
+  debugRegisterMap.clear();
+}
+
+static uint64_t encodeRegisterForDwarf(std::string registerName) {
+  if (registerName.length() > 8) {
+// The name is more than 8 characters long, and so won't fit into 64 bits.
+return 0;
+  }
+
+  // Encode the name string into a DWARF register number using cuda-gdb's
+  // encoding.  See cuda_check_dwarf2_reg_ptx_virtual_register in cuda-tdep.c,
+  // 
https://github.com/NVIDIA/cuda-gdb/blob/e5cf3bddae520ffb326f95b4d98ce5c7474b828b/gdb/cuda/cuda-tdep.c#L353
+  // IE the bytes of the string are concatenated in reverse into a single
+  // number, which is stored in ULEB128, but in practice must be no more than 8
+  // bytes (excluding null terminator, which is not included).
+  uint64_t result = 0;
+  for (int i = 0; i < registerName.length(); ++i) {
+result = result << 8;
+char c = registerName[i];
+result += c;

walter-erquinigo wrote:

Use |= for clarity because you are doing a bitwise operation

https://github.com/llvm/llvm-project/pull/109495
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NVPTX] add support for encoding PTX registers for DWARF (PR #109495)

2024-09-22 Thread Walter Erquinigo via llvm-branch-commits


@@ -1773,6 +1774,26 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   OutStreamer->emitRawText(O.str());
 }
 
+/// Translate virtual register numbers in DebugInfo locations to their printed
+/// encodings, as used by CUDA-GDB.
+void NVPTXAsmPrinter::encodeDebugInfoRegisterNumbers(
+const MachineFunction &MF) {
+  const NVPTXSubtarget &STI = MF.getSubtarget();
+  const NVPTXRegisterInfo *registerInfo = STI.getRegisterInfo();
+
+  // Clear the old mapping, and add the new one.  This mapping is used after 
the
+  // printing of the current function is complete, but before the next function
+  // is printed.
+  registerInfo->clearDebugRegisterMap();
+
+  for (auto classMap : VRegMapping) {
+for (auto registerMapping : classMap.getSecond()) {

walter-erquinigo wrote:

could you make these iterators references?

https://github.com/llvm/llvm-project/pull/109495
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NVPTX] add support for encoding PTX registers for DWARF (PR #109495)

2024-09-22 Thread Walter Erquinigo via llvm-branch-commits


@@ -141,3 +142,47 @@ NVPTXRegisterInfo::getFrameLocalRegister(const 
MachineFunction &MF) const {
   static_cast(MF.getTarget());
   return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
 }
+
+void NVPTXRegisterInfo::clearDebugRegisterMap() const {
+  debugRegisterMap.clear();
+}
+
+static uint64_t encodeRegisterForDwarf(std::string registerName) {
+  if (registerName.length() > 8) {
+// The name is more than 8 characters long, and so won't fit into 64 bits.
+return 0;
+  }

walter-erquinigo wrote:

could this validly happen or would it be an error if it happens?

https://github.com/llvm/llvm-project/pull/109495
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NVPTX] add support for encoding PTX registers for DWARF (PR #109495)

2024-09-22 Thread Walter Erquinigo via llvm-branch-commits


@@ -141,3 +142,47 @@ NVPTXRegisterInfo::getFrameLocalRegister(const 
MachineFunction &MF) const {
   static_cast(MF.getTarget());
   return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
 }
+
+void NVPTXRegisterInfo::clearDebugRegisterMap() const {
+  debugRegisterMap.clear();
+}
+
+static uint64_t encodeRegisterForDwarf(std::string registerName) {
+  if (registerName.length() > 8) {
+// The name is more than 8 characters long, and so won't fit into 64 bits.
+return 0;
+  }
+
+  // Encode the name string into a DWARF register number using cuda-gdb's
+  // encoding.  See cuda_check_dwarf2_reg_ptx_virtual_register in cuda-tdep.c,
+  // 
https://github.com/NVIDIA/cuda-gdb/blob/e5cf3bddae520ffb326f95b4d98ce5c7474b828b/gdb/cuda/cuda-tdep.c#L353
+  // IE the bytes of the string are concatenated in reverse into a single
+  // number, which is stored in ULEB128, but in practice must be no more than 8
+  // bytes (excluding null terminator, which is not included).
+  uint64_t result = 0;
+  for (int i = 0; i < registerName.length(); ++i) {
+result = result << 8;
+char c = registerName[i];
+result += c;
+  }
+  return result;
+}
+
+void NVPTXRegisterInfo::addToDebugRegisterMap(
+uint64_t preEncodedVirtualRegister, std::string registerName) const {
+  uint64_t mapped = encodeRegisterForDwarf(registerName);
+  if (mapped == 0)
+return;
+  debugRegisterMap.insert({preEncodedVirtualRegister, mapped});
+}
+
+int64_t NVPTXRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const {
+  if (Register::isPhysicalRegister(RegNum)) {
+std::string name = NVPTXInstPrinter::getRegisterName(RegNum.id());
+return encodeRegisterForDwarf(name);
+  }
+  auto lookup = debugRegisterMap.lookup(RegNum.id());

walter-erquinigo wrote:

don't use auto here

https://github.com/llvm/llvm-project/pull/109495
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [NVPTX] add support for encoding PTX registers for DWARF (PR #109495)

2024-09-22 Thread Walter Erquinigo via llvm-branch-commits


@@ -141,3 +142,47 @@ NVPTXRegisterInfo::getFrameLocalRegister(const 
MachineFunction &MF) const {
   static_cast(MF.getTarget());
   return TM.is64Bit() ? NVPTX::VRFrameLocal64 : NVPTX::VRFrameLocal32;
 }
+
+void NVPTXRegisterInfo::clearDebugRegisterMap() const {
+  debugRegisterMap.clear();
+}
+
+static uint64_t encodeRegisterForDwarf(std::string registerName) {
+  if (registerName.length() > 8) {
+// The name is more than 8 characters long, and so won't fit into 64 bits.
+return 0;
+  }
+
+  // Encode the name string into a DWARF register number using cuda-gdb's
+  // encoding.  See cuda_check_dwarf2_reg_ptx_virtual_register in cuda-tdep.c,
+  // 
https://github.com/NVIDIA/cuda-gdb/blob/e5cf3bddae520ffb326f95b4d98ce5c7474b828b/gdb/cuda/cuda-tdep.c#L353
+  // IE the bytes of the string are concatenated in reverse into a single
+  // number, which is stored in ULEB128, but in practice must be no more than 8
+  // bytes (excluding null terminator, which is not included).
+  uint64_t result = 0;
+  for (int i = 0; i < registerName.length(); ++i) {
+result = result << 8;
+char c = registerName[i];

walter-erquinigo wrote:

shouldn't this be unsigned char?

https://github.com/llvm/llvm-project/pull/109495
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)

2024-09-22 Thread Ivan R. Ivanov via llvm-branch-commits

https://github.com/ivanradanov updated 
https://github.com/llvm/llvm-project/pull/104748

>From e0ef194ecf8bf0e9c450ee11c244eb4450548aef Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov 
Date: Sun, 4 Aug 2024 17:33:52 +0900
Subject: [PATCH 1/2] Add workshare loop wrapper lowerings

Bufferize test

Bufferize test

Bufferize test

Add test for should use workshare lowering
---
 .../HLFIR/Transforms/BufferizeHLFIR.cpp   |   4 +-
 .../Transforms/OptimizedBufferization.cpp |  10 +-
 flang/test/HLFIR/bufferize-workshare.fir  |  58 
 .../OpenMP/should-use-workshare-lowering.mlir | 140 ++
 4 files changed, 208 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/HLFIR/bufferize-workshare.fir
 create mode 100644 
flang/test/Transforms/OpenMP/should-use-workshare-lowering.mlir

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp 
b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 07794828fce267..1848dbe2c7a2c2 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/PatternMatch.h"
@@ -792,7 +793,8 @@ struct ElementalOpConversion
 // Generate a loop nest looping around the fir.elemental shape and clone
 // fir.elemental region inside the inner loop.
 hlfir::LoopNest loopNest =
-hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
+hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(),
+   flangomp::shouldUseWorkshareLowering(elemental));
 auto insPt = builder.saveInsertionPoint();
 builder.setInsertionPointToStart(loopNest.body);
 auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp 
b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
index 3a0a98dc594463..f014724861e333 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/OptimizedBufferization.cpp
@@ -20,6 +20,7 @@
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/Transforms/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Dominance.h"
@@ -482,7 +483,8 @@ llvm::LogicalResult 
ElementalAssignBufferization::matchAndRewrite(
   // Generate a loop nest looping around the hlfir.elemental shape and clone
   // hlfir.elemental region inside the inner loop
   hlfir::LoopNest loopNest =
-  hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered());
+  hlfir::genLoopNest(loc, builder, extents, !elemental.isOrdered(),
+ flangomp::shouldUseWorkshareLowering(elemental));
   builder.setInsertionPointToStart(loopNest.body);
   auto yield = hlfir::inlineElementalOp(loc, builder, elemental,
 loopNest.oneBasedIndices);
@@ -553,7 +555,8 @@ llvm::LogicalResult 
BroadcastAssignBufferization::matchAndRewrite(
   llvm::SmallVector extents =
   hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
-  hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
+  hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+ flangomp::shouldUseWorkshareLowering(assign));
   builder.setInsertionPointToStart(loopNest.body);
   auto arrayElement =
   hlfir::getElementAt(loc, builder, lhs, loopNest.oneBasedIndices);
@@ -648,7 +651,8 @@ llvm::LogicalResult 
VariableAssignBufferization::matchAndRewrite(
   llvm::SmallVector extents =
   hlfir::getIndexExtents(loc, builder, shape);
   hlfir::LoopNest loopNest =
-  hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true);
+  hlfir::genLoopNest(loc, builder, extents, /*isUnordered=*/true,
+ flangomp::shouldUseWorkshareLowering(assign));
   builder.setInsertionPointToStart(loopNest.body);
   auto rhsArrayElement =
   hlfir::getElementAt(loc, builder, rhs, loopNest.oneBasedIndices);
diff --git a/flang/test/HLFIR/bufferize-workshare.fir 
b/flang/test/HLFIR/bufferize-workshare.fir
new file mode 100644
index 00..9b7341ae43398a
--- /dev/null
+++ b/flang/test/HLFIR/bufferize-workshare.fir
@@ -0,0 +1,58 @@
+// RUN: fir-opt --bufferize-hlfir %s | FileCheck %s
+
+// CHECK-LABEL:   func.func @simple(
+// CHECK-SAME:  %[[VAL_0:.*]]: 
!fir.ref>) {
+// CHECK:   omp.parallel {
+// CHECK: omp.workshare {
+// CHECK:   %[[VAL_1:.*]] = arith.c

[llvm-branch-commits] [flang] [flang] Lower omp.workshare to other omp constructs (PR #101446)

2024-09-22 Thread via llvm-branch-commits

github-actions[bot] wrote:




:warning: C/C++ code formatter, clang-format found issues in your code. 
:warning:



You can test this locally with the following command:


``bash
git-clang-format --diff d5fbe9c7482b87be295be03aafd5917dd7c17859 
79ac7998609480d18be4ea3bc61b6c1c77089f70 --extensions h,cpp,inc -- 
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp 
flang/include/flang/Optimizer/OpenMP/Passes.h 
flang/include/flang/Tools/CLOptions.inc 
flang/include/flang/Tools/CrossToolHelpers.h 
flang/lib/Frontend/FrontendActions.cpp flang/tools/bbc/bbc.cpp 
flang/tools/tco/tco.cpp
``





View the diff from clang-format here.


``diff
diff --git a/flang/include/flang/Tools/CLOptions.inc 
b/flang/include/flang/Tools/CLOptions.inc
index bb00e07900..81ce69b4ec 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -336,8 +336,8 @@ inline void createDefaultFIROptimizerPassPipeline(
 /// \param pm - MLIR pass manager that will hold the pipeline definition
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
-inline void createHLFIRToFIRPassPipeline(
-mlir::PassManager &pm, bool enableOpenMP, llvm::OptimizationLevel optLevel 
= defaultOptLevel) {
+inline void createHLFIRToFIRPassPipeline(mlir::PassManager &pm,
+bool enableOpenMP, llvm::OptimizationLevel optLevel = defaultOptLevel) {
   if (optLevel.isOptimizingForSpeed()) {
 addCanonicalizerPassWithoutRegionSimplification(pm);
 addNestedPassToAllTopLevelOperations(

``




https://github.com/llvm/llvm-project/pull/101446
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [flang] [WIP][flang] Introduce HLFIR lowerings to omp.workshare_loop_nest (PR #104748)

2024-09-22 Thread Ivan R. Ivanov via llvm-branch-commits

https://github.com/ivanradanov updated 
https://github.com/llvm/llvm-project/pull/104748

>From 975a0d74c5ae81c69844b8bd089832ed53278477 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov 
Date: Mon, 23 Sep 2024 15:07:48 +0900
Subject: [PATCH 1/4] Emit a proper error message for CFG in workshare

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 13 +-
 .../OpenMP/lower-workshare-todo-cfg-dom.mlir  | 23 ++
 .../OpenMP/lower-workshare-todo-cfg.mlir  | 20 +
 .../Transforms/OpenMP/lower-workshare5.mlir   | 42 ---
 4 files changed, 55 insertions(+), 43 deletions(-)
 create mode 100644 
flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
 delete mode 100644 flang/test/Transforms/OpenMP/lower-workshare5.mlir

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp 
b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index 6e5538b54ba5e0..cf1867311cc236 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -16,6 +16,7 @@
 //
 
//===--===//
 
+#include "flang/Optimizer/Builder/Todo.h"
 #include 
 #include 
 #include 
@@ -416,8 +417,18 @@ LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, 
DominanceInfo &di) {
 
   parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc, di);
 
+  // FIXME Currently, we only support workshare constructs with structured
+  // control flow. The transformation itself supports CFG, however, once we
+  // transform the MLIR region in the omp.workshare, we need to inline that
+  // region in the parent block. We have no guarantees at this point of the
+  // pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
+  // generally possible.  The alternative is to put the lowered region in an
+  // operation akin to scf.execute_region, which will get lowered at the same
+  // time when fir ops get lowered to CFG. However, SCF is not registered in
+  // flang so we cannot use it. Remove this requirement once we have
+  // scf.execute_region or an alternative operation available.
   if (wsOp.getRegion().getBlocks().size() != 1)
-return failure();
+TODO(wsOp->getLoc(), "omp workshare with unstructured control flow");
 
   // Inline the contents of the placeholder workshare op into its parent block.
   Block *theBlock = &newOp.getRegion().front();
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir 
b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
new file mode 100644
index 00..1c47d448f597d9
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg-dom.mlir
@@ -0,0 +1,23 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | 
FileCheck %s
+
+// CHECK: not yet implemented: omp workshare with unstructured control flow
+
+// Check that the definition of %r dominates its use post-transform
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+omp.workshare {
+^bb1:
+  %c1 = arith.constant 1 : i32
+  cf.br ^bb3(%c1: i32)
+^bb2:
+  "test.test2"(%r) : (i32) -> ()
+  omp.terminator
+^bb3(%arg1: i32):
+  %r = "test.test2"(%arg1) : (i32) -> i32
+  cf.br ^bb2
+}
+omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir 
b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
new file mode 100644
index 00..bf6c196a05b4a3
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-todo-cfg.mlir
@@ -0,0 +1,20 @@
+// RUN: fir-opt --lower-workshare --allow-unregistered-dialect %s 2>&1 | 
FileCheck %s
+
+// CHECK: not yet implemented: omp workshare with unstructured control flow
+
+// Check transforming a simple CFG
+func.func @wsfunc() {
+  %a = fir.alloca i32
+  omp.parallel {
+omp.workshare {
+^bb1:
+  %c1 = arith.constant 1 : i32
+  cf.br ^bb3(%c1: i32)
+^bb3(%arg1: i32):
+  "test.test2"(%arg1) : (i32) -> ()
+  omp.terminator
+}
+omp.terminator
+  }
+  return
+}
diff --git a/flang/test/Transforms/OpenMP/lower-workshare5.mlir 
b/flang/test/Transforms/OpenMP/lower-workshare5.mlir
deleted file mode 100644
index 177f8aa8f86c7c..00
--- a/flang/test/Transforms/OpenMP/lower-workshare5.mlir
+++ /dev/null
@@ -1,42 +0,0 @@
-// XFAIL: *
-// RUN: fir-opt --split-input-file --lower-workshare 
--allow-unregistered-dialect %s | FileCheck %s
-
-// TODO we can lower these but we have no guarantee that the parent of
-// omp.workshare supports multi-block regions, thus we fail for now.
-
-func.func @wsfunc() {
-  %a = fir.alloca i32
-  omp.parallel {
-omp.workshare {
-^bb1:
-  %c1 = arith.constant 1 : i32
-  cf.br ^bb3(%c1: i32)
-^bb3(%arg1: i32):
-  "test.test2"(%arg1) : (i32) -> ()
-  omp.terminator
-}
-omp.terminator
-  }
-

[llvm-branch-commits] [flang] [flang] Lower omp.workshare to other omp constructs (PR #101446)

2024-09-22 Thread Ivan R. Ivanov via llvm-branch-commits

https://github.com/ivanradanov updated 
https://github.com/llvm/llvm-project/pull/101446

>From e56dbd6a0625890fd9a3d6a62675e864ca94a8f5 Mon Sep 17 00:00:00 2001
From: Ivan Radanov Ivanov 
Date: Sun, 4 Aug 2024 22:06:55 +0900
Subject: [PATCH 1/3] [flang] Lower omp.workshare to other omp constructs

Change to workshare loop wrapper op

Move single op declaration

Schedule pass properly

Correctly handle nested nested loop nests to be parallelized by workshare

Leave comments for shouldUseWorkshareLowering

Use copyprivate to scatter val from omp.single

TODO still need to implement copy function
TODO transitive check for usage outside of omp.single not imiplemented yet

Transitively check for users outisde of single op

TODO need to implement copy func
TODO need to hoist allocas outside of single regions

Add tests

Hoist allocas

More tests

Emit body for copy func

Test the tmp storing logic

Clean up trivially dead ops

Only handle single-block regions for now

Fix tests for custom assembly for loop wrapper

Only run the lower workshare pass if openmp is enabled

Implement some missing functionality

Fix tests

Fix test

Iterate backwards to find all trivially dead ops

Add expalanation comment for createCopyFun

Update test
---
 flang/include/flang/Optimizer/OpenMP/Passes.h |   5 +
 .../include/flang/Optimizer/OpenMP/Passes.td  |   5 +
 flang/include/flang/Tools/CLOptions.inc   |   6 +-
 flang/include/flang/Tools/CrossToolHelpers.h  |   1 +
 flang/lib/Frontend/FrontendActions.cpp|  10 +-
 flang/lib/Optimizer/OpenMP/CMakeLists.txt |   1 +
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 446 ++
 flang/test/Fir/basic-program.fir  |   1 +
 .../Transforms/OpenMP/lower-workshare.mlir| 189 
 .../Transforms/OpenMP/lower-workshare2.mlir   |  23 +
 .../Transforms/OpenMP/lower-workshare3.mlir   |  74 +++
 .../Transforms/OpenMP/lower-workshare4.mlir   |  59 +++
 .../Transforms/OpenMP/lower-workshare5.mlir   |  42 ++
 .../Transforms/OpenMP/lower-workshare6.mlir   |  51 ++
 flang/tools/bbc/bbc.cpp   |   5 +-
 flang/tools/tco/tco.cpp   |   1 +
 16 files changed, 915 insertions(+), 4 deletions(-)
 create mode 100644 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare2.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare3.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare4.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare5.mlir
 create mode 100644 flang/test/Transforms/OpenMP/lower-workshare6.mlir

diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h 
b/flang/include/flang/Optimizer/OpenMP/Passes.h
index 403d79667bf448..feb395f1a12dbd 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.h
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.h
@@ -25,6 +25,11 @@ namespace flangomp {
 #define GEN_PASS_REGISTRATION
 #include "flang/Optimizer/OpenMP/Passes.h.inc"
 
+/// Impelements the logic specified in the 2.8.3  workshare Construct section 
of
+/// the OpenMP standard which specifies what statements or constructs shall be
+/// divided into units of work.
+bool shouldUseWorkshareLowering(mlir::Operation *op);
+
 } // namespace flangomp
 
 #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td 
b/flang/include/flang/Optimizer/OpenMP/Passes.td
index 395178e26a5762..041240cad12eb3 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -37,4 +37,9 @@ def FunctionFiltering : Pass<"omp-function-filtering"> {
   ];
 }
 
+// Needs to be scheduled on Module as we create functions in it
+def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> {
+  let summary = "Lower workshare construct";
+}
+
 #endif //FORTRAN_OPTIMIZER_OPENMP_PASSES
diff --git a/flang/include/flang/Tools/CLOptions.inc 
b/flang/include/flang/Tools/CLOptions.inc
index 1881e23b00045a..bb00e079008a0b 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -337,7 +337,7 @@ inline void createDefaultFIROptimizerPassPipeline(
 /// \param optLevel - optimization level used for creating FIR optimization
 ///   passes pipeline
 inline void createHLFIRToFIRPassPipeline(
-mlir::PassManager &pm, llvm::OptimizationLevel optLevel = defaultOptLevel) 
{
+mlir::PassManager &pm, bool enableOpenMP, llvm::OptimizationLevel optLevel 
= defaultOptLevel) {
   if (optLevel.isOptimizingForSpeed()) {
 addCanonicalizerPassWithoutRegionSimplification(pm);
 addNestedPassToAllTopLevelOperations(
@@ -354,6 +354,8 @@ inline void createHLFIRToFIRPassPipeline(
   pm.addPass(hlfir::createLowerHLFIRIntrinsics());
   pm.addPass(hlfir::createBufferizeHLFIR());
   pm.addPass(hlfir::createConvertHLFIRtoFIR());
+  if (enableOpenMP)
+pm.add