[clang] [CIR][OpenMP] Emit #pragma omp for as omp.wsloop + omp.loop_nest (PR #181841)

Luca Parigi via cfe-commits Mon, 02 Mar 2026 04:11:02 -0800

https://github.com/Parigi updated 
https://github.com/llvm/llvm-project/pull/181841


>From b937e1e308df7de84265fd57282cd2d33c40ecc2 Mon Sep 17 00:00:00 2001
From: Luca Parigi <[email protected]>
Date: Tue, 17 Feb 2026 16:41:08 +0100
Subject: [PATCH 1/2] [CIR][OpenMP] Emit #pragma omp for as omp.wsloop +
 omp.loop_nest

Emit OMPForDirective as omp.wsloop with omp.loop_nest, using CIR ops
for bounds/step and converting them to standard MLIR integers via
UnrealizedConversionCastOp.

Add reconcile-unrealized-casts pass to the CIR-to-LLVM pipeline.
Add CIR-level and LLVM IR lowering tests.
---
 clang/lib/CIR/CodeGen/CIRGenFunction.h        |  13 +
 clang/lib/CIR/CodeGen/CIRGenStmt.cpp          | 170 ++++++---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenMP.cpp    | 202 ++++++++++-
 .../CIR/Lowering/DirectToLLVM/CMakeLists.txt  |   1 +
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |   6 +
 clang/test/CIR/CodeGenOpenMP/pragma-omp-for.c | 326 ++++++++++++++++++
 clang/test/CIR/Lowering/pragma-omp-for.c      | 188 ++++++++++
 7 files changed, 854 insertions(+), 52 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenOpenMP/pragma-omp-for.c
 create mode 100644 clang/test/CIR/Lowering/pragma-omp-for.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h 
b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index cc0087ba2d6bd..32e3044dc08a5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -63,6 +63,19 @@ class CIRGenFunction : public CIRGenTypeCache {
   /// is where the next operations will be introduced.
   CIRGenBuilderTy &builder;
 
+  /// State used to communicate OpenMP loop bounds from `emitOMPForDirective`
+  /// to `emitForStmt`.
+  struct LoopBounds {
+    mlir::Value lowerBound;
+    mlir::Value upperBound;
+    mlir::Value step;
+    mlir::Type inductionVarType;
+    const VarDecl *inductionVar;
+    bool inclusive;
+  };
+
+  std::optional<LoopBounds> currentOMPLoopBounds;
+
   /// A jump destination is an abstract label, branching to which may
   /// require a jump out through normal cleanups.
   struct JumpDest {
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp 
b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index db3827340c455..f6b3b976ac499 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -22,6 +22,10 @@
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/CIR/MissingFeatures.h"
 
+// Required to construct the OpenMP `omp.loop_nest` and `omp.yield`
+// operations that `emitForStmt` emits for `#pragma omp for` loops.
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+
 using namespace clang;
 using namespace clang::CIRGen;
 using namespace cir;
@@ -939,16 +943,36 @@ CIRGenFunction::emitCXXForRangeStmt(const CXXForRangeStmt 
&s,
   return mlir::success();
 }
 
+/// Emit a `for` statement as either a CIR `cir.for` or, when inside an
+/// OpenMP `#pragma omp for`, an `omp.loop_nest` within the wsloop created
+/// by emitOMPForDirective.
+
 mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) {
+
+  // CIR for-loop operation (used in the non-OpenMP case).
   cir::ForOp forOp;
 
+  // OpenMP loop nest operation (used when inside `omp.wsloop`).
+  mlir::omp::LoopNestOp loopNestOp;
+
+  auto scopeLoc = getLoc(s.getSourceRange());
+  bool isOpenMPFor = currentOMPLoopBounds.has_value();
+
+  // This lambda emits either an OpenMP `omp.loop_nest` or a regular CIR
+  // `cir.for`, depending on whether we are inside an OpenMP for directive.
   // TODO: pass in an array of attributes.
   auto forStmtBuilder = [&]() -> mlir::LogicalResult {
     mlir::LogicalResult loopRes = mlir::success();
-    // Evaluate the first part before the loop.
-    if (s.getInit())
-      if (emitStmt(s.getInit(), /*useCurrentScope=*/true).failed())
-        return mlir::failure();
+
+    // For OpenMP loops, init is emitted by emitOMPForDirective before the
+    // wsloop so that the alloca lives outside the loop region.
+    if (!isOpenMPFor) {
+      // Evaluate the first part before the loop.
+      if (s.getInit())
+        if (emitStmt(s.getInit(), /*useCurrentScope=*/true).failed())
+          return mlir::failure();
+    }
+
     assert(!cir::MissingFeatures::loopInfoStack());
     // In the classic codegen, if there are any cleanups between here and the
     // loop-exit scope, a block is created to stage the loop exit. We probably
@@ -956,58 +980,110 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const 
ForStmt &s) {
     // to be sure we handle all cases.
     assert(!cir::MissingFeatures::requiresCleanups());
 
-    forOp = builder.createFor(
-        getLoc(s.getSourceRange()),
-        /*condBuilder=*/
-        [&](mlir::OpBuilder &b, mlir::Location loc) {
-          assert(!cir::MissingFeatures::createProfileWeightsForLoop());
-          
assert(!cir::MissingFeatures::emitCondLikelihoodViaExpectIntrinsic());
-          mlir::Value condVal;
-          if (s.getCond()) {
-            // If the for statement has a condition scope,
-            // emit the local variable declaration.
-            if (s.getConditionVariable())
-              emitDecl(*s.getConditionVariable());
-            // C99 6.8.5p2/p4: The first substatement is executed if the
-            // expression compares unequal to 0. The condition must be a
-            // scalar type.
-            condVal = evaluateExprAsBool(s.getCond());
-          } else {
-            condVal = cir::ConstantOp::create(b, loc, builder.getTrueAttr());
-          }
-          builder.createCondition(condVal);
-        },
-        /*bodyBuilder=*/
-        [&](mlir::OpBuilder &b, mlir::Location loc) {
-          // The scope of the for loop body is nested within the scope of the
-          // for loop's init-statement and condition.
-          if (emitStmt(s.getBody(), /*useCurrentScope=*/false).failed())
-            loopRes = mlir::failure();
-          emitStopPoint(&s);
-        },
-        /*stepBuilder=*/
-        [&](mlir::OpBuilder &b, mlir::Location loc) {
-          if (s.getInc())
-            if (emitStmt(s.getInc(), /*useCurrentScope=*/true).failed())
+    // OpenMP path: emit omp.loop_nest using bounds from emitOMPForDirective.
+    if (isOpenMPFor) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+
+      mlir::Type loopBoundsType = currentOMPLoopBounds->inductionVarType;
+      mlir::Value lb = currentOMPLoopBounds->lowerBound;
+      mlir::Value ub = currentOMPLoopBounds->upperBound;
+      mlir::Value step = currentOMPLoopBounds->step;
+      bool inclusive = currentOMPLoopBounds->inclusive;
+      const VarDecl *inductionVar = currentOMPLoopBounds->inductionVar;
+
+      loopNestOp = loopNestOp.create(builder, scopeLoc, 1, lb, ub, step,
+                                     inclusive, nullptr);
+
+      mlir::Region &region = loopNestOp.getRegion();
+      mlir::Block *block = new mlir::Block();
+      region.push_back(block);
+
+      block->addArgument(loopBoundsType, scopeLoc);
+      builder.setInsertionPointToStart(block);
+
+      // Store the IV block argument into the loop variable alloca, converting
+      // back from standard integer to CIR integer type.
+      mlir::Value iv = block->getArgument(0);
+      Address inductionAddr = getAddrOfLocalVar(inductionVar);
+      mlir::Value civVal =
+          mlir::UnrealizedConversionCastOp::create(
+              builder, scopeLoc, inductionAddr.getElementType(), iv)
+              .getResult(0);
+      cir::StoreOp::create(builder, scopeLoc, civVal,
+                           inductionAddr.getPointer(),
+                           /*is_volatile=*/nullptr, /*alignment=*/nullptr,
+                           /*sync_scope=*/nullptr, /*mem_order=*/nullptr);
+
+      // Emit the loop body.
+      if (s.getBody()) {
+        if (emitStmt(s.getBody(), /*useCurrentScope=*/true).failed())
+          loopRes = mlir::failure();
+      }
+
+      mlir::omp::YieldOp::create(builder, getLoc(s.getEndLoc()));
+    } else {
+      forOp = builder.createFor(
+          getLoc(s.getSourceRange()),
+          /*condBuilder=*/
+          [&](mlir::OpBuilder &b, mlir::Location loc) {
+            assert(!cir::MissingFeatures::createProfileWeightsForLoop());
+            assert(
+                !cir::MissingFeatures::emitCondLikelihoodViaExpectIntrinsic());
+            mlir::Value condVal;
+            if (s.getCond()) {
+              // If the for statement has a condition scope,
+              // emit the local variable declaration.
+              if (s.getConditionVariable())
+                emitDecl(*s.getConditionVariable());
+              // C99 6.8.5p2/p4: The first substatement is executed if the
+              // expression compares unequal to 0. The condition must be a
+              // scalar type.
+              condVal = evaluateExprAsBool(s.getCond());
+            } else {
+              condVal = cir::ConstantOp::create(b, loc, builder.getTrueAttr());
+            }
+            builder.createCondition(condVal);
+          },
+          /*bodyBuilder=*/
+          [&](mlir::OpBuilder &b, mlir::Location loc) {
+            // The scope of the for loop body is nested within the scope of the
+            // for loop's init-statement and condition.
+            if (emitStmt(s.getBody(), /*useCurrentScope=*/false).failed())
               loopRes = mlir::failure();
-          builder.createYield(loc);
-        });
+            emitStopPoint(&s);
+          },
+          /*stepBuilder=*/
+          [&](mlir::OpBuilder &b, mlir::Location loc) {
+            if (s.getInc())
+              if (emitStmt(s.getInc(), /*useCurrentScope=*/true).failed())
+                loopRes = mlir::failure();
+            builder.createYield(loc);
+          });
+    }
     return loopRes;
   };
 
   auto res = mlir::success();
-  auto scopeLoc = getLoc(s.getSourceRange());
-  cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/
-                       [&](mlir::OpBuilder &b, mlir::Location loc) {
-                         LexicalScope lexScope{*this, loc,
-                                               builder.getInsertionBlock()};
-                         res = forStmtBuilder();
-                       });
+
+  if (isOpenMPFor) {
+    res = forStmtBuilder();
+  } else {
+    cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/
+                         [&](mlir::OpBuilder &b, mlir::Location loc) {
+                           LexicalScope lexScope{*this, loc,
+                                                 builder.getInsertionBlock()};
+                           res = forStmtBuilder();
+                         });
+  }
 
   if (res.failed())
     return res;
 
-  terminateBody(builder, forOp.getBody(), getLoc(s.getEndLoc()));
+  // Only regular CIR loops require explicit termination.
+  // OpenMP wsloop/loop_nest regions terminate via omp.yield.
+  if (!isOpenMPFor) {
+    terminateBody(builder, forOp.getBody(), getLoc(s.getEndLoc()));
+  }
   return mlir::success();
 }
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenMP.cpp 
b/clang/lib/CIR/CodeGen/CIRGenStmtOpenMP.cpp
index 0d3b44db98307..a3eab79fbba64 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenMP.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenMP.cpp
@@ -13,8 +13,11 @@
 #include "CIRGenBuilder.h"
 #include "CIRGenFunction.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "clang/AST/StmtOpenMP.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
+
 using namespace clang;
 using namespace clang::CIRGen;
 
@@ -65,6 +68,200 @@ CIRGenFunction::emitOMPParallelDirective(const 
OMPParallelDirective &s) {
   return res;
 }
 
+// Helpers and implementation for emitOMPForDirective, which lowers an
+// OMPForDirective into an omp.wsloop + omp.loop_nest.
+
+namespace {
+/// Extract integer literal value from an expression, if present.
+static std::optional<int64_t> getIntLiteralValue(const Expr *expr) {
+  if (const auto *intLit = dyn_cast<IntegerLiteral>(expr->IgnoreImpCasts()))
+    return intLit->getValue().getSExtValue();
+  return std::nullopt;
+}
+
+/// Ensure a CIR value has the given CIR integer type, inserting an integral
+/// cast if necessary. Loads through CIR pointers first.
+static mlir::Value ensureCIRIntType(CIRGenBuilderTy &builder,
+                                    mlir::Location loc, mlir::Value cirValue,
+                                    cir::IntType targetCIRType) {
+  if (mlir::isa<cir::PointerType>(cirValue.getType()))
+    cirValue = cir::LoadOp::create(builder, loc, cirValue).getResult();
+
+  if (cirValue.getType() == targetCIRType)
+    return cirValue;
+
+  return builder.createCast(loc, cir::CastKind::integral, cirValue,
+                            targetCIRType);
+}
+
+/// Convert a CIR integer value to a standard MLIR integer type suitable for
+/// use as an omp.loop_nest operand.
+static mlir::Value cirIntToStdInt(mlir::OpBuilder &builder, mlir::Location loc,
+                                  mlir::Value cirValue) {
+  auto cirIntType = mlir::cast<cir::IntType>(cirValue.getType());
+  mlir::Type stdIntType = builder.getIntegerType(cirIntType.getWidth());
+  return mlir::UnrealizedConversionCastOp::create(builder, loc, stdIntType,
+                                                  cirValue)
+      .getResult(0);
+}
+} // anonymous namespace
+
+mlir::LogicalResult
+CIRGenFunction::emitOMPForDirective(const OMPForDirective &s) {
+
+  mlir::LogicalResult res = mlir::success();
+  mlir::Location begin = getLoc(s.getBeginLoc());
+
+  // Extract the underlying canonical `for` loop from the CapturedStmt
+  const CapturedStmt *capturedStmt = s.getInnermostCapturedStmt();
+  const ForStmt *forStmt = dyn_cast<ForStmt>(capturedStmt->getCapturedStmt());
+
+  if (!forStmt) {
+    return mlir::failure();
+  }
+
+  // Loop bounds are first built as CIR integer values, then converted to
+  // standard MLIR integers via UnrealizedConversionCastOp before being
+  // passed to omp.loop_nest (which requires IntLikeType operands).
+  mlir::Value lowerBound;
+  mlir::Value upperBound;
+  mlir::Value step;
+  bool inclusive = false;
+
+  // Extract loop variable type and lower bound.
+  const auto *declStmt = dyn_cast_or_null<DeclStmt>(forStmt->getInit());
+  const auto *varDecl =
+      declStmt ? dyn_cast<VarDecl>(declStmt->getSingleDecl()) : nullptr;
+
+  if (!varDecl)
+    return mlir::failure();
+
+  // The loop variable's CIR integer type is the canonical type for all bounds.
+  QualType loopVarQType = varDecl->getType();
+  auto cirType = convertType(loopVarQType);
+  auto cirIntType = mlir::cast<cir::IntType>(cirType);
+
+  // Extract lower bound.
+  if (!varDecl->hasInit())
+    return mlir::failure();
+
+  if (auto constVal = getIntLiteralValue(varDecl->getInit())) {
+    lowerBound = builder.getConstInt(begin, cirIntType, *constVal);
+  } else {
+    mlir::Value cirValue = emitScalarExpr(varDecl->getInit());
+    lowerBound = ensureCIRIntType(builder, begin, cirValue, cirIntType);
+  }
+
+  // Extract upper bound and comparison operator.
+  const auto *condBinOp = dyn_cast_or_null<BinaryOperator>(forStmt->getCond());
+  if (!condBinOp)
+    return mlir::failure();
+
+  BinaryOperatorKind opKind = condBinOp->getOpcode();
+
+  // Determine which side of the comparison holds the upper bound.
+  // Canonical forms: `i < ub`, `i <= ub` (var on LHS, bound on RHS)
+  //                  `ub > i`, `ub >= i` (bound on LHS, var on RHS)
+  const Expr *boundExpr = nullptr;
+  if (opKind == BO_LT || opKind == BO_LE) {
+    boundExpr = condBinOp->getRHS();
+    inclusive = (opKind == BO_LE);
+  } else if (opKind == BO_GT || opKind == BO_GE) {
+    boundExpr = condBinOp->getLHS();
+    inclusive = (opKind == BO_GE);
+  } else {
+    return mlir::failure();
+  }
+
+  if (auto constVal = getIntLiteralValue(boundExpr)) {
+    upperBound = builder.getConstInt(begin, cirIntType, *constVal);
+  } else {
+    mlir::Value cirValue = emitScalarExpr(boundExpr);
+    upperBound = ensureCIRIntType(builder, begin, cirValue, cirIntType);
+  }
+
+  // Extract step.
+  if (const auto *unaryOp =
+          dyn_cast_or_null<UnaryOperator>(forStmt->getInc())) {
+    int64_t val = unaryOp->isIncrementOp() ? 1 : -1;
+    step = builder.getConstInt(begin, cirIntType, val);
+  } else if (const auto *binOp =
+                 dyn_cast_or_null<BinaryOperator>(forStmt->getInc())) {
+    const Expr *stepExpr = nullptr;
+
+    if (binOp->isCompoundAssignmentOp()) {
+      stepExpr = binOp->getRHS();
+    } else if (binOp->isAssignmentOp()) {
+      // i = i + step or i = step + i
+      if (auto *subBinOp =
+              dyn_cast<BinaryOperator>(binOp->getRHS()->IgnoreImpCasts())) {
+        const Expr *lhs = subBinOp->getLHS()->IgnoreImpCasts();
+        const Expr *rhs = subBinOp->getRHS()->IgnoreImpCasts();
+        // Identify which operand is the loop variable and which is the step.
+        if (auto *lhsRef = dyn_cast<DeclRefExpr>(lhs)) {
+          stepExpr = (lhsRef->getDecl() == varDecl) ? rhs : lhs;
+        } else if (auto *rhsRef = dyn_cast<DeclRefExpr>(rhs)) {
+          stepExpr = (rhsRef->getDecl() == varDecl) ? lhs : rhs;
+        }
+      }
+    }
+
+    if (stepExpr) {
+      if (auto constVal = getIntLiteralValue(stepExpr)) {
+        step = builder.getConstInt(begin, cirIntType, *constVal);
+      } else {
+        mlir::Value cirValue = emitScalarExpr(stepExpr);
+        step = ensureCIRIntType(builder, begin, cirValue, cirIntType);
+      }
+    }
+  }
+
+  // Default to unit step if not recognized.
+  if (!step)
+    step = builder.getConstInt(begin, cirIntType, 1);
+
+  // Emit init, convert bounds to std integers, and create the wsloop.
+
+  // Emit the loop init statement (e.g. `int i = 0`) to create the alloca
+  // for the induction variable *before* the wsloop.
+  if (forStmt->getInit())
+    if (emitStmt(forStmt->getInit(), /*useCurrentScope=*/true).failed())
+      return mlir::failure();
+
+  // Convert CIR integer bounds to standard MLIR integers at the boundary.
+  // omp.loop_nest requires IntLikeType (AnyInteger | Index), not CIR types.
+  mlir::Value stdLB = cirIntToStdInt(builder, begin, lowerBound);
+  mlir::Value stdUB = cirIntToStdInt(builder, begin, upperBound);
+  mlir::Value stdStep = cirIntToStdInt(builder, begin, step);
+  mlir::Type loopBoundsType = stdLB.getType();
+
+  currentOMPLoopBounds =
+      LoopBounds{stdLB, stdUB, stdStep, loopBoundsType, varDecl, inclusive};
+
+  // Create wsloop with empty region
+  llvm::SmallVector<mlir::Type> retTy;
+  llvm::SmallVector<mlir::Value> operands;
+  auto wsloopOp = mlir::omp::WsloopOp::create(builder, begin, retTy, operands);
+
+  mlir::Region &region = wsloopOp.getRegion();
+  mlir::Block *block = new mlir::Block();
+  region.push_back(block);
+
+  // Emit the ForStmt itself; emitForStmt creates the omp.loop_nest when
+  // currentOMPLoopBounds is set.
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  builder.setInsertionPointToStart(block);
+
+  if (emitStmt(forStmt, /*useCurrentScope=*/false).failed()) {
+    res = mlir::failure();
+  }
+
+  // Clear loop-bound state
+  currentOMPLoopBounds = std::nullopt;
+
+  return res;
+}
+
 mlir::LogicalResult
 CIRGenFunction::emitOMPTaskwaitDirective(const OMPTaskwaitDirective &s) {
   getCIRGenModule().errorNYI(s.getSourceRange(), "OpenMP 
OMPTaskwaitDirective");
@@ -113,11 +310,6 @@ CIRGenFunction::emitOMPFuseDirective(const 
OMPFuseDirective &s) {
   return mlir::failure();
 }
 mlir::LogicalResult
-CIRGenFunction::emitOMPForDirective(const OMPForDirective &s) {
-  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenMP OMPForDirective");
-  return mlir::failure();
-}
-mlir::LogicalResult
 CIRGenFunction::emitOMPForSimdDirective(const OMPForSimdDirective &s) {
   getCIRGenModule().errorNYI(s.getSourceRange(), "OpenMP OMPForSimdDirective");
   return mlir::failure();
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt 
b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
index c7467fe40ba30..49864dcdb62d5 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
@@ -22,6 +22,7 @@ add_clang_library(clangCIRLoweringDirectToLLVM
   MLIRBuiltinToLLVMIRTranslation
   MLIRLLVMToLLVMIRTranslation
   MLIROpenMPToLLVMIRTranslation
+  MLIRReconcileUnrealizedCasts
   MLIRIR
   )
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 28b3454d20613..0feeaf748fd75 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -17,6 +17,7 @@
 #include <optional>
 
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -3562,6 +3563,10 @@ void ConvertCIRToLLVMPass::runOnOperation() {
   target.addIllegalDialect<mlir::BuiltinDialect, cir::CIRDialect,
                            mlir::func::FuncDialect>();
 
+  // Allow unrealized conversion casts to survive CIR-to-LLVM conversion.
+  // They are resolved by the reconcile-unrealized-casts pass that runs after.
+  target.addLegalOp<mlir::UnrealizedConversionCastOp>();
+
   llvm::SmallVector<mlir::Operation *> ops;
   ops.push_back(module);
   collectUnreachable(module, ops);
@@ -4800,6 +4805,7 @@ std::unique_ptr<mlir::Pass> createConvertCIRToLLVMPass() {
 void populateCIRToLLVMPasses(mlir::OpPassManager &pm) {
   mlir::populateCIRPreLoweringPasses(pm);
   pm.addPass(createConvertCIRToLLVMPass());
+  pm.addPass(mlir::createReconcileUnrealizedCastsPass());
 }
 
 std::unique_ptr<llvm::Module>
diff --git a/clang/test/CIR/CodeGenOpenMP/pragma-omp-for.c 
b/clang/test/CIR/CodeGenOpenMP/pragma-omp-for.c
new file mode 100644
index 0000000000000..49a046f358e10
--- /dev/null
+++ b/clang/test/CIR/CodeGenOpenMP/pragma-omp-for.c
@@ -0,0 +1,326 @@
+// RUN: %clang_cc1 -fopenmp -emit-cir -fclangir %s -o - | FileCheck %s
+
+void before(int);
+void during(int);
+void after(int);
+
+void emit_simple_for() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_simple_for
+  int j = 5;
+  before(j);
+  // CHECK: cir.call @{{.*}}before
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 10; i++) {
+        during(j);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CIR constants for bounds, then cast to std integer
+  // CHECK: %[[C0_CIR:.*]] = cir.const #cir.int<0> : !s32i
+  // CHECK: %[[C10_CIR:.*]] = cir.const #cir.int<10> : !s32i
+  // CHECK: %[[C1_CIR:.*]] = cir.const #cir.int<1> : !s32i
+
+  // induction variable alloca (emitted before wsloop)
+  // CHECK: %[[I_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+
+  // conversion to std integer for omp.loop_nest
+  // CHECK: %[[C0:.*]] = builtin.unrealized_conversion_cast %[[C0_CIR]] : 
!s32i to i32
+  // CHECK: %[[C10:.*]] = builtin.unrealized_conversion_cast %[[C10_CIR]] : 
!s32i to i32
+  // CHECK: %[[C1:.*]] = builtin.unrealized_conversion_cast %[[C1_CIR]] : 
!s32i to i32
+
+  // omp loop
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 = (%[[C0]]) to (%[[C10]]) 
step (%[[C1]]) {
+
+  // store induction variable block arg into alloca
+  // CHECK: %[[IV_CIR:.*]] = builtin.unrealized_conversion_cast %[[IV]] : i32 
to !s32i
+  // CHECK: cir.store %[[IV_CIR]], %[[I_ALLOCA]] : !s32i, !cir.ptr<!s32i>
+
+  // during(j)
+  // CHECK: cir.load {{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+  // CHECK: cir.call @{{.*}}during
+
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+
+  // CHECK: omp.terminator
+  // CHECK: }
+  after(j);
+  // CHECK: cir.call @{{.*}}after
+}
+
+void emit_for_with_vars() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_with_vars
+  int j = 5;
+  before(j);
+  // CHECK: cir.call @{{.*}}before
+#pragma omp parallel
+  {
+    int lb = 1;
+    long ub = 10;
+    short step = 1;
+#pragma omp for
+    for (int i = 0; i < ub; i=i+step) {
+        during(j);
+    }
+  }
+
+  // CHECK: omp.parallel {
+
+  // allocas
+  // CHECK: %[[LB:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["lb", init]
+  // CHECK: %[[UB:.*]] = cir.alloca !s64i, !cir.ptr<!s64i>, ["ub", init]
+  // CHECK: %[[STEP:.*]] = cir.alloca !s16i, !cir.ptr<!s16i>, ["step", init]
+
+  // stores
+  // CHECK: cir.store {{.*}}, %[[LB]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: cir.store {{.*}}, %[[UB]] : !s64i, !cir.ptr<!s64i>
+  // CHECK: cir.store {{.*}}, %[[STEP]] : !s16i, !cir.ptr<!s16i>
+
+  // lower bound (CIR constant + cast to i32)
+  // CHECK: %[[LB0_CIR:.*]] = cir.const #cir.int<0> : !s32i
+
+  // upper bound: load, integral cast to i32, then unrealized cast
+  // CHECK: %[[UBLOAD:.*]] = cir.load {{.*}} %[[UB]] : !cir.ptr<!s64i>, !s64i
+  // CHECK: %[[UBCAST:.*]] = cir.cast integral %[[UBLOAD]] : !s64i -> !s32i
+
+  // step: load, integral cast to i32, then unrealized cast
+  // CHECK: %[[STEPLOAD:.*]] = cir.load {{.*}} %[[STEP]] : !cir.ptr<!s16i>, 
!s16i
+  // CHECK: %[[STEPCONV:.*]] = cir.cast integral %[[STEPLOAD]] : !s16i -> !s32i
+
+  // induction variable alloca (emitted before wsloop)
+  // CHECK: %[[I2_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+
+  // conversion to std integer for omp.loop_nest
+  // CHECK: %[[LB0:.*]] = builtin.unrealized_conversion_cast %[[LB0_CIR]] : 
!s32i to i32
+  // CHECK: %[[UBSTD:.*]] = builtin.unrealized_conversion_cast %[[UBCAST]] : 
!s32i to i32
+  // CHECK: %[[STEPSTD:.*]] = builtin.unrealized_conversion_cast %[[STEPCONV]] 
: !s32i to i32
+
+  // omp loop
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%[[IV2:.*]]) : i32 = (%[[LB0]]) to 
(%[[UBSTD]]) step (%[[STEPSTD]]) {
+
+  // store induction variable block arg into alloca
+  // CHECK: %[[IV2_CIR:.*]] = builtin.unrealized_conversion_cast %[[IV2]] : 
i32 to !s32i
+  // CHECK: cir.store %[[IV2_CIR]], %[[I2_ALLOCA]] : !s32i, !cir.ptr<!s32i>
+
+  // during(j)
+  // CHECK: cir.load {{.*}} %{{.*}} : !cir.ptr<!s32i>, !s32i
+  // CHECK: cir.call @{{.*}}during
+
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+
+  // CHECK: omp.terminator
+  // CHECK: }
+
+  after(j);
+  // CHECK: cir.call @{{.*}}after
+}
+
+void emit_for_with_induction_var() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_with_induction_var
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 10; i++) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CIR constants
+  // CHECK: %[[IC0_CIR:.*]] = cir.const #cir.int<0> : !s32i
+  // CHECK: %[[IC10_CIR:.*]] = cir.const #cir.int<10> : !s32i
+  // CHECK: %[[IC1_CIR:.*]] = cir.const #cir.int<1> : !s32i
+
+  // induction variable alloca
+  // CHECK: %[[IV_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+
+  // conversion to std integer
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[IC0_CIR]] : 
!s32i to i32
+  // CHECK: %[[IC10:.*]] = builtin.unrealized_conversion_cast %[[IC10_CIR]] : 
!s32i to i32
+  // CHECK: %[[IC1:.*]] = builtin.unrealized_conversion_cast %[[IC1_CIR]] : 
!s32i to i32
+
+  // omp loop
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%[[IV3:.*]]) : i32 = (%[[IC0]]) to (%[[IC10]]) 
step (%[[IC1]]) {
+
+  // store induction variable into alloca
+  // CHECK: %[[IV3_CIR:.*]] = builtin.unrealized_conversion_cast %[[IV3]] : 
i32 to !s32i
+  // CHECK: cir.store %[[IV3_CIR]], %[[IV_ALLOCA]] : !s32i, !cir.ptr<!s32i>
+
+  // during(i) - loads the induction variable from the alloca
+  // CHECK: %[[I_VAL:.*]] = cir.load %[[IV_ALLOCA]] : !cir.ptr<!s32i>, !s32i
+  // CHECK: cir.call @{{.*}}during(%[[I_VAL]])
+
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+
+  // CHECK: omp.terminator
+  // CHECK: }
+}
+
+// Test inclusive upper bound (i <= 9)
+void emit_for_inclusive_bound() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_inclusive_bound
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i <= 9; i++) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CHECK: cir.const #cir.int<0> : !s32i
+  // CHECK: cir.const #cir.int<9> : !s32i
+  // CHECK: cir.const #cir.int<1> : !s32i
+  // CHECK: %[[INC_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+  // CHECK: %[[INC_C0:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[INC_C9:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[INC_C1:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+
+  // CHECK: omp.wsloop {
+  // inclusive = true
+  // CHECK-NEXT: omp.loop_nest (%[[INC_IV:.*]]) : i32 = (%[[INC_C0]]) to 
(%[[INC_C9]]) inclusive step (%[[INC_C1]]) {
+
+  // CHECK: builtin.unrealized_conversion_cast %[[INC_IV]] : i32 to !s32i
+  // CHECK: cir.store
+  // CHECK: cir.call @{{.*}}during
+
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+  // CHECK: omp.terminator
+  // CHECK: }
+}
+
+// Test reversed comparison (10 > i)
+void emit_for_reversed_cmp() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_reversed_cmp
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; 10 > i; i++) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CHECK: cir.const #cir.int<0> : !s32i
+  // CHECK: cir.const #cir.int<10> : !s32i
+  // CHECK: cir.const #cir.int<1> : !s32i
+  // CHECK: cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+  // CHECK: %[[REV_C0:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[REV_C10:.*]] = builtin.unrealized_conversion_cast {{.*}} : 
!s32i to i32
+  // CHECK: %[[REV_C1:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%[[REV_C0]]) to 
(%[[REV_C10]]) step (%[[REV_C1]]) {
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+  // CHECK: omp.terminator
+  // CHECK: }
+}
+
+// Test reversed inclusive comparison (9 >= i)
+void emit_for_reversed_inclusive_cmp() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_reversed_inclusive_cmp
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; 9 >= i; i++) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CHECK: cir.const #cir.int<0> : !s32i
+  // CHECK: cir.const #cir.int<9> : !s32i
+  // CHECK: cir.const #cir.int<1> : !s32i
+  // CHECK: cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+  // CHECK: %[[RI_C0:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[RI_C9:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[RI_C1:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%[[RI_C0]]) to (%[[RI_C9]]) 
inclusive step (%[[RI_C1]]) {
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+  // CHECK: omp.terminator
+  // CHECK: }
+}
+
+// Test compound assignment step (i += 2)
+void emit_for_compound_step() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_compound_step
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 20; i += 2) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CHECK: cir.const #cir.int<0> : !s32i
+  // CHECK: cir.const #cir.int<20> : !s32i
+  // CHECK: cir.const #cir.int<2> : !s32i
+  // CHECK: cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+  // CHECK: %[[CS_C0:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[CS_C20:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[CS_C2:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%[[CS_C0]]) to (%[[CS_C20]]) 
step (%[[CS_C2]]) {
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+  // CHECK: omp.terminator
+  // CHECK: }
+}
+
+// Test commuted step expression (i = step + i)
+void emit_for_commuted_step() {
+  // CHECK: cir.func{{.*}}@{{.*}}emit_for_commuted_step
+  short step = 3;
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 30; i = step + i) {
+        during(i);
+    }
+  }
+  // CHECK: omp.parallel {
+
+  // CHECK: cir.const #cir.int<0> : !s32i
+  // CHECK: cir.const #cir.int<30> : !s32i
+
+  // step is loaded and cast to the loop variable type (i32) in CIR
+  // CHECK: %[[CM_STEP_LOAD:.*]] = cir.load {{.*}} : !cir.ptr<!s16i>, !s16i
+  // CHECK: %[[CM_STEP_CIR:.*]] = cir.cast integral %[[CM_STEP_LOAD]] : !s16i 
-> !s32i
+
+  // CHECK: cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+
+  // conversion to std integer
+  // CHECK: %[[CM_C0:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[CM_C30:.*]] = builtin.unrealized_conversion_cast {{.*}} : !s32i 
to i32
+  // CHECK: %[[CM_STEP:.*]] = builtin.unrealized_conversion_cast 
%[[CM_STEP_CIR]] : !s32i to i32
+
+  // CHECK: omp.wsloop {
+  // CHECK-NEXT: omp.loop_nest (%{{.*}}) : i32 = (%[[CM_C0]]) to (%[[CM_C30]]) 
step (%[[CM_STEP]]) {
+  // CHECK: omp.yield
+  // CHECK: }
+  // CHECK: }
+  // CHECK: omp.terminator
+  // CHECK: }
+}
diff --git a/clang/test/CIR/Lowering/pragma-omp-for.c 
b/clang/test/CIR/Lowering/pragma-omp-for.c
new file mode 100644
index 0000000000000..76f069a4cd9a9
--- /dev/null
+++ b/clang/test/CIR/Lowering/pragma-omp-for.c
@@ -0,0 +1,188 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck %s --input-file %t-cir.ll
+
+void before(int);
+void during(int);
+void after(int);
+
+// Test simple for loop with constant bounds: for (int i = 0; i < 10; i++)
+void emit_simple_for() {
+  int j = 5;
+  before(j);
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 10; i++) {
+      during(j);
+    }
+  }
+  after(j);
+}
+
+// CHECK-LABEL: define dso_local void @emit_simple_for()
+// CHECK: call void @before(i32 %{{.*}})
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @emit_simple_for..omp_par, ptr %{{.*}})
+// CHECK: call void @after(i32 %{{.*}})
+
+// CHECK-LABEL: define internal void @emit_simple_for..omp_par(
+// CHECK: store i32 0, ptr %p.lowerbound
+// CHECK: store i32 9, ptr %p.upperbound
+// CHECK: store i32 1, ptr %p.stride
+// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp_loop.body:
+// CHECK: omp.loop_nest.region:
+// CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 4
+// CHECK: call void @during(i32 %{{.*}})
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: call void @__kmpc_barrier(
+
+// Test for loop with variable bounds and type conversions
+void emit_for_with_vars() {
+  int j = 5;
+  before(j);
+#pragma omp parallel
+  {
+    int lb = 1;
+    long ub = 10;
+    short step = 1;
+#pragma omp for
+    for (int i = 0; i < ub; i = i + step) {
+      during(j);
+    }
+  }
+  after(j);
+}
+
+// CHECK-LABEL: define dso_local void @emit_for_with_vars()
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @emit_for_with_vars..omp_par, ptr %{{.*}})
+
+// CHECK-LABEL: define internal void @emit_for_with_vars..omp_par(
+// variable upper bound: loaded and truncated from i64 to i32
+// CHECK: %{{.*}} = trunc i64 %{{.*}} to i32
+// variable step: loaded and sign-extended from i16 to i32
+// CHECK: %{{.*}} = sext i16 %{{.*}} to i32
+// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp.loop_nest.region:
+// CHECK: call void @during(i32 %{{.*}})
+// CHECK: call void @__kmpc_for_static_fini(
+
+// Test induction variable is accessible in the loop body: during(i)
+void emit_for_with_induction_var() {
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 10; i++) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_with_induction_var..omp_par(
+// CHECK: store i32 0, ptr %p.lowerbound
+// CHECK: store i32 9, ptr %p.upperbound
+// CHECK: omp.loop_nest.region:
+// IV is stored to the alloca and then loaded for during(i)
+// CHECK: store i32 %{{.*}}, ptr %[[IV_PTR:.*]], align 4
+// CHECK: %[[IV_LOAD:.*]] = load i32, ptr %[[IV_PTR]], align 4
+// CHECK: call void @during(i32 %[[IV_LOAD]])
+
+// Test inclusive upper bound: for (int i = 0; i <= 9; i++)
+void emit_for_inclusive_bound() {
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i <= 9; i++) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_inclusive_bound..omp_par(
+// inclusive i <= 9 has same trip count as i < 10
+// CHECK: store i32 0, ptr %p.lowerbound
+// CHECK: store i32 9, ptr %p.upperbound
+// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp.loop_nest.region:
+// CHECK: call void @during(i32 %{{.*}})
+
+// Test reversed comparison: for (int i = 0; 10 > i; i++)
+void emit_for_reversed_cmp() {
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; 10 > i; i++) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_reversed_cmp..omp_par(
+// reversed cmp (10 > i) produces same bounds as (i < 10)
+// CHECK: store i32 0, ptr %p.lowerbound
+// CHECK: store i32 9, ptr %p.upperbound
+// CHECK: call void @__kmpc_for_static_init_4u(
+
+// Test reversed inclusive comparison: for (int i = 0; 9 >= i; i++)
+void emit_for_reversed_inclusive_cmp() {
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; 9 >= i; i++) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_reversed_inclusive_cmp..omp_par(
+// reversed inclusive cmp (9 >= i) produces same bounds as (i <= 9)
+// CHECK: store i32 0, ptr %p.lowerbound
+// CHECK: store i32 9, ptr %p.upperbound
+// CHECK: call void @__kmpc_for_static_init_4u(
+
+// Test compound assignment step: for (int i = 0; i < 20; i += 2)
+void emit_for_compound_step() {
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 20; i += 2) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_compound_step..omp_par(
+// step = 2 visible in the loop body IV computation
+// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp_loop.body:
+// CHECK: %{{.*}} = mul i32 %{{.*}}, 2
+// CHECK: omp.loop_nest.region:
+// CHECK: call void @during(i32 %{{.*}})
+
+// Test commuted step expression: for (int i = 0; i < 30; i = step + i)
+void emit_for_commuted_step() {
+  short step = 3;
+#pragma omp parallel
+  {
+#pragma omp for
+    for (int i = 0; i < 30; i = step + i) {
+      during(i);
+    }
+  }
+}
+
+// CHECK-LABEL: define internal void @emit_for_commuted_step..omp_par(
+// variable step loaded and sign-extended from i16
+// CHECK: %{{.*}} = sext i16 %{{.*}} to i32
+// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp_loop.body:
+// step is variable, multiplied into IV
+// CHECK: %{{.*}} = mul i32 %{{.*}}, %{{.*}}
+// CHECK: omp.loop_nest.region:
+// CHECK: call void @during(i32 %{{.*}})
+
+// Verify OpenMP runtime declarations
+// CHECK: declare i32 @__kmpc_global_thread_num(ptr)
+// CHECK: declare void @__kmpc_for_static_init_4u(ptr, i32, i32, ptr, ptr, ptr, ptr, i32, i32)
+// CHECK: declare void @__kmpc_for_static_fini(ptr, i32)
+// CHECK: declare void @__kmpc_barrier(ptr, i32)
+// CHECK: declare {{.*}}void @__kmpc_fork_call(ptr, i32, ptr, ...)

>From 1ccd034420e7482dda388717e30b6b1d9a65f65f Mon Sep 17 00:00:00 2001
From: Luca Parigi <[email protected]>
Date: Mon, 2 Mar 2026 13:10:43 +0100
Subject: [PATCH 2/2] [CIR][OpenMP] Updated lowering test on pragma-omp-for.c

Complete the checks on the lowered code, which was not fully covered before,
and reduce the test to two representative cases to avoid repetition.
---
 clang/test/CIR/Lowering/pragma-omp-for.c | 416 ++++++++++++++++-------
 1 file changed, 285 insertions(+), 131 deletions(-)

diff --git a/clang/test/CIR/Lowering/pragma-omp-for.c b/clang/test/CIR/Lowering/pragma-omp-for.c
index 76f069a4cd9a9..03605c4c42933 100644
--- a/clang/test/CIR/Lowering/pragma-omp-for.c
+++ b/clang/test/CIR/Lowering/pragma-omp-for.c
@@ -1,9 +1,17 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fopenmp -fclangir -emit-llvm %s -o %t-cir.ll
 // RUN: FileCheck %s --input-file %t-cir.ll
 
+// CHECK: %struct.ident_t = type { i32, i32, i32, i32, ptr }
+// CHECK: @[[LOC:.*]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+// CHECK: @{{[0-9]+}} = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[LOC]] }, align 8
+// CHECK: @{{[0-9]+}} = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[LOC]] }, align 8
+
 void before(int);
+// CHECK: declare void @before(i32)
 void during(int);
+// CHECK: declare void @during(i32)
 void after(int);
+// CHECK: declare void @after(i32)
 
 // Test simple for loop with constant bounds: for (int i = 0; i < 10; i++)
 void emit_simple_for() {
@@ -19,22 +27,133 @@ void emit_simple_for() {
   after(j);
 }
 
-// CHECK-LABEL: define dso_local void @emit_simple_for()
-// CHECK: call void @before(i32 %{{.*}})
-// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @emit_simple_for..omp_par, ptr %{{.*}})
-// CHECK: call void @after(i32 %{{.*}})
+// CHECK-LABEL: define{{.*}} void @emit_simple_for()
+// CHECK:    %structArg = alloca { ptr, ptr }, align 8
+// CHECK:    %[[I_ALLOCA:.*]] = alloca i32, i64 1, align 4
+// CHECK:    %[[J_ALLOCA:.*]] = alloca i32, i64 1, align 4
+// CHECK:    store i32 5, ptr %[[J_ALLOCA]], align 4
+// CHECK:    %[[J_VAL:.*]] = load i32, ptr %[[J_ALLOCA]], align 4
+// CHECK:    call void @before(i32 %[[J_VAL]])
+// CHECK:    br label %entry
+
+// CHECK: entry:
+// CHECK:    %[[GTN_OUTER:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    br label %omp_parallel
+
+// CHECK: omp_parallel:
+// CHECK:    %[[GEP_I:.*]] = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
+// CHECK:    store ptr %[[I_ALLOCA]], ptr %[[GEP_I]], align 8
+// CHECK:    %[[GEP_J:.*]] = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
+// CHECK:    store ptr %[[J_ALLOCA]], ptr %[[GEP_J]], align 8
+// CHECK:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{[0-9]+}}, i32 1, ptr @emit_simple_for..omp_par, ptr %structArg)
+// CHECK:    br label %omp.par.exit
+
+// CHECK: omp.par.exit:
+// CHECK:    %[[J_AFTER:.*]] = load i32, ptr %[[J_ALLOCA]], align 4
+// CHECK:    call void @after(i32 %[[J_AFTER]])
+// CHECK:    ret void
+
+
+// CHECK-LABEL: define{{.*}} void @emit_simple_for..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0)
+
+// CHECK: omp.par.entry:
+// CHECK:    %[[GEP_I_PAR:.*]] = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 0
+// CHECK:    %[[LOADGEP_I:.*]] = load ptr, ptr %[[GEP_I_PAR]], align 8
+// CHECK:    %[[GEP_J_PAR:.*]] = getelementptr { ptr, ptr }, ptr %0, i32 0, i32 1
+// CHECK:    %[[LOADGEP_J:.*]] = load ptr, ptr %[[GEP_J_PAR]], align 8
+// CHECK:    %p.lastiter = alloca i32, align 4
+// CHECK:    %p.lowerbound = alloca i32, align 4
+// CHECK:    %p.upperbound = alloca i32, align 4
+// CHECK:    %p.stride = alloca i32, align 4
+// CHECK:    %[[TID_LOCAL:.*]] = alloca i32, align 4
+// CHECK:    %[[TID_LOAD:.*]] = load i32, ptr %tid.addr, align 4
+// CHECK:    store i32 %[[TID_LOAD]], ptr %[[TID_LOCAL]], align 4
+// CHECK:    %[[TID:.*]] = load i32, ptr %[[TID_LOCAL]], align 4
+// CHECK:    br label %omp.region.after_alloca2
+
+// CHECK: omp.region.after_alloca2:
+// CHECK:    br label %omp.region.after_alloca
+
+// CHECK: omp.region.after_alloca:
+// CHECK:    br label %omp.par.region
+
+// CHECK: omp.par.region:
+// CHECK:    br label %omp.par.region1
+
+// CHECK: omp.par.region1:
+// initialize i = 0 before the worksharing loop
+// CHECK:    store i32 0, ptr %[[LOADGEP_I]], align 4
+// CHECK:    br label %omp.wsloop.region
+
+// CHECK: omp.wsloop.region:
+// CHECK:    br label %omp_loop.preheader
+
+// CHECK: omp_loop.preheader:
+// set normalized loop bounds: lb=0, ub=9 (tripcount-1), stride=1
+// CHECK:    store i32 0, ptr %p.lowerbound, align 4
+// CHECK:    store i32 9, ptr %p.upperbound, align 4
+// CHECK:    store i32 1, ptr %p.stride, align 4
+// CHECK:    %[[GTN_WSLOOP:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    call void @__kmpc_for_static_init_4u(ptr @{{[0-9]+}}, i32 %[[GTN_WSLOOP]], i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
+// reload thread-local lb/ub after static partitioning and compute local trip count
+// CHECK:    %[[SF_LB:.*]] = load i32, ptr %p.lowerbound, align 4
+// CHECK:    %[[SF_UB:.*]] = load i32, ptr %p.upperbound, align 4
+// CHECK:    %[[SF_DIFF:.*]] = sub i32 %[[SF_UB]], %[[SF_LB]]
+// CHECK:    %[[SF_TC:.*]] = add i32 %[[SF_DIFF]], 1
+// CHECK:    br label %omp_loop.header
+
+// CHECK: omp_loop.header:
+// CHECK:    %omp_loop.iv = phi i32 [ 0, %omp_loop.preheader ], [ %[[LOOP_NEXT:.*]], %omp_loop.inc ]
+// CHECK:    br label %omp_loop.cond
+
+// CHECK: omp_loop.cond:
+// CHECK:    %[[LOOP_CMP:.*]] = icmp ult i32 %omp_loop.iv, %[[SF_TC]]
+// CHECK:    br i1 %[[LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
+
+// CHECK: omp_loop.exit:
+// CHECK:    call void @__kmpc_for_static_fini(ptr @{{[0-9]+}}, i32 %[[GTN_WSLOOP]])
+// CHECK:    %[[GTN_BARRIER:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    call void @__kmpc_barrier(ptr @{{[0-9]+}}, i32 %[[GTN_BARRIER]])
+// CHECK:    br label %omp_loop.after
+
+// CHECK: omp_loop.after:
+// CHECK:    br label %omp.region.cont3
+
+// CHECK: omp.region.cont3:
+// CHECK:    br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK:    br label %omp.par.pre_finalize
+
+// CHECK: omp.par.pre_finalize:
+// CHECK:    br label %.fini
+
+// CHECK: .fini:
+// CHECK:    br label %omp.par.exit.exitStub
 
-// CHECK-LABEL: define internal void @emit_simple_for..omp_par(
-// CHECK: store i32 0, ptr %p.lowerbound
-// CHECK: store i32 9, ptr %p.upperbound
-// CHECK: store i32 1, ptr %p.stride
-// CHECK: call void @__kmpc_for_static_init_4u(
 // CHECK: omp_loop.body:
+// real IV = (normalized_iv + lb_offset) * step + init_val (step is the constant 1 here)
+// CHECK:    %[[BODY_IV:.*]] = add i32 %omp_loop.iv, %[[SF_LB]]
+// CHECK:    %[[BODY_SCALED:.*]] = mul i32 %[[BODY_IV]], 1
+// CHECK:    %[[BODY_FINAL:.*]] = add i32 %[[BODY_SCALED]], 0
+// CHECK:    br label %omp.loop_nest.region
+
 // CHECK: omp.loop_nest.region:
-// CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 4
-// CHECK: call void @during(i32 %{{.*}})
-// CHECK: call void @__kmpc_for_static_fini(
-// CHECK: call void @__kmpc_barrier(
+// store computed IV to i's alloca; load j and call during(j)
+// CHECK:    store i32 %[[BODY_FINAL]], ptr %[[LOADGEP_I]], align 4
+// CHECK:    %[[J_DURING:.*]] = load i32, ptr %[[LOADGEP_J]], align 4
+// CHECK:    call void @during(i32 %[[J_DURING]])
+// CHECK:    br label %omp.region.cont4
+
+// CHECK: omp.region.cont4:
+// CHECK:    br label %omp_loop.inc
+
+// CHECK: omp_loop.inc:
+// CHECK:    %[[LOOP_NEXT]] = add nuw i32 %omp_loop.iv, 1
+// CHECK:    br label %omp_loop.header
+
+// CHECK: omp.par.exit.exitStub:
+// CHECK:    ret void
 
 // Test for loop with variable bounds and type conversions
 void emit_for_with_vars() {
@@ -53,136 +172,171 @@ void emit_for_with_vars() {
   after(j);
 }
 
-// CHECK-LABEL: define dso_local void @emit_for_with_vars()
-// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @emit_for_with_vars..omp_par, ptr %{{.*}})
+// CHECK-LABEL: define{{.*}} void @emit_for_with_vars()
+// CHECK:    %structArg = alloca { ptr, ptr, ptr, ptr, ptr }, align 8
+// CHECK:    %[[LB_ALLOCA:.*]] = alloca i32, i64 1, align 4
+// CHECK:    %[[UB_ALLOCA:.*]] = alloca i64, i64 1, align 8
+// CHECK:    %[[STEP_ALLOCA:.*]] = alloca i16, i64 1, align 2
+// CHECK:    %[[I_ALLOCA:.*]] = alloca i32, i64 1, align 4
+// CHECK:    %[[J_ALLOCA:.*]] = alloca i32, i64 1, align 4
+// CHECK:    store i32 5, ptr %[[J_ALLOCA]], align 4
+// CHECK:    %[[J_VAL:.*]] = load i32, ptr %[[J_ALLOCA]], align 4
+// CHECK:    call void @before(i32 %[[J_VAL]])
+// CHECK:    br label %entry
 
-// CHECK-LABEL: define internal void @emit_for_with_vars..omp_par(
-// variable upper bound: loaded and truncated from i64 to i32
-// CHECK: %{{.*}} = trunc i64 %{{.*}} to i32
-// variable step: loaded and sign-extended from i16 to i32
-// CHECK: %{{.*}} = sext i16 %{{.*}} to i32
-// CHECK: call void @__kmpc_for_static_init_4u(
-// CHECK: omp.loop_nest.region:
-// CHECK: call void @during(i32 %{{.*}})
-// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: entry:
+// CHECK:    %[[GTN_OUTER:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    br label %omp_parallel
 
-// Test induction variable is accessible in the loop body: during(i)
-void emit_for_with_induction_var() {
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; i < 10; i++) {
-      during(i);
-    }
-  }
-}
+// CHECK: omp_parallel:
+// CHECK:    %[[GEP_LB:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %structArg, i32 0, i32 0
+// CHECK:    store ptr %[[LB_ALLOCA]], ptr %[[GEP_LB]], align 8
+// CHECK:    %[[GEP_UB:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %structArg, i32 0, i32 1
+// CHECK:    store ptr %[[UB_ALLOCA]], ptr %[[GEP_UB]], align 8
+// CHECK:    %[[GEP_STEP:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %structArg, i32 0, i32 2
+// CHECK:    store ptr %[[STEP_ALLOCA]], ptr %[[GEP_STEP]], align 8
+// CHECK:    %[[GEP_I:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %structArg, i32 0, i32 3
+// CHECK:    store ptr %[[I_ALLOCA]], ptr %[[GEP_I]], align 8
+// CHECK:    %[[GEP_J:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %structArg, i32 0, i32 4
+// CHECK:    store ptr %[[J_ALLOCA]], ptr %[[GEP_J]], align 8
+// CHECK:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @{{[0-9]+}}, i32 1, ptr @emit_for_with_vars..omp_par, ptr %structArg)
+// CHECK:    br label %omp.par.exit
 
-// CHECK-LABEL: define internal void @emit_for_with_induction_var..omp_par(
-// CHECK: store i32 0, ptr %p.lowerbound
-// CHECK: store i32 9, ptr %p.upperbound
-// CHECK: omp.loop_nest.region:
-// IV is stored to the alloca and then loaded for during(i)
-// CHECK: store i32 %{{.*}}, ptr %[[IV_PTR:.*]], align 4
-// CHECK: %[[IV_LOAD:.*]] = load i32, ptr %[[IV_PTR]], align 4
-// CHECK: call void @during(i32 %[[IV_LOAD]])
+// CHECK: omp.par.exit:
+// CHECK:    %[[J_AFTER:.*]] = load i32, ptr %[[J_ALLOCA]], align 4
+// CHECK:    call void @after(i32 %[[J_AFTER]])
+// CHECK:    ret void
 
-// Test inclusive upper bound: for (int i = 0; i <= 9; i++)
-void emit_for_inclusive_bound() {
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; i <= 9; i++) {
-      during(i);
-    }
-  }
-}
 
-// CHECK-LABEL: define internal void @emit_for_inclusive_bound..omp_par(
-// inclusive i <= 9 has same trip count as i < 10
-// CHECK: store i32 0, ptr %p.lowerbound
-// CHECK: store i32 9, ptr %p.upperbound
-// CHECK: call void @__kmpc_for_static_init_4u(
-// CHECK: omp.loop_nest.region:
-// CHECK: call void @during(i32 %{{.*}})
+// CHECK-LABEL: define{{.*}} void @emit_for_with_vars..omp_par(ptr noalias %tid.addr, ptr noalias %zero.addr, ptr %0)
 
-// Test reversed comparison: for (int i = 0; 10 > i; i++)
-void emit_for_reversed_cmp() {
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; 10 > i; i++) {
-      during(i);
-    }
-  }
-}
+// CHECK: omp.par.entry:
+// CHECK:    %[[GEP_LB_PAR:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %0, i32 0, i32 0
+// CHECK:    %[[LOADGEP_LB:.*]] = load ptr, ptr %[[GEP_LB_PAR]], align 8
+// CHECK:    %[[GEP_UB_PAR:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %0, i32 0, i32 1
+// CHECK:    %[[LOADGEP_UB:.*]] = load ptr, ptr %[[GEP_UB_PAR]], align 8
+// CHECK:    %[[GEP_STEP_PAR:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %0, i32 0, i32 2
+// CHECK:    %[[LOADGEP_STEP:.*]] = load ptr, ptr %[[GEP_STEP_PAR]], align 8
+// CHECK:    %[[GEP_I_PAR:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %0, i32 0, i32 3
+// CHECK:    %[[LOADGEP_I:.*]] = load ptr, ptr %[[GEP_I_PAR]], align 8
+// CHECK:    %[[GEP_J_PAR:.*]] = getelementptr { ptr, ptr, ptr, ptr, ptr }, ptr %0, i32 0, i32 4
+// CHECK:    %[[LOADGEP_J:.*]] = load ptr, ptr %[[GEP_J_PAR]], align 8
+// CHECK:    %p.lastiter = alloca i32, align 4
+// CHECK:    %p.lowerbound = alloca i32, align 4
+// CHECK:    %p.upperbound = alloca i32, align 4
+// CHECK:    %p.stride = alloca i32, align 4
+// CHECK:    %[[TID_LOCAL:.*]] = alloca i32, align 4
+// CHECK:    %[[TID_LOAD:.*]] = load i32, ptr %tid.addr, align 4
+// CHECK:    store i32 %[[TID_LOAD]], ptr %[[TID_LOCAL]], align 4
+// CHECK:    %[[TID:.*]] = load i32, ptr %[[TID_LOCAL]], align 4
+// CHECK:    br label %omp.region.after_alloca2
 
-// CHECK-LABEL: define internal void @emit_for_reversed_cmp..omp_par(
-// reversed cmp (10 > i) produces same bounds as (i < 10)
-// CHECK: store i32 0, ptr %p.lowerbound
-// CHECK: store i32 9, ptr %p.upperbound
-// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp.region.after_alloca2:
+// CHECK:    br label %omp.region.after_alloca
 
-// Test reversed inclusive comparison: for (int i = 0; 9 >= i; i++)
-void emit_for_reversed_inclusive_cmp() {
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; 9 >= i; i++) {
-      during(i);
-    }
-  }
-}
+// CHECK: omp.region.after_alloca:
+// CHECK:    br label %omp.par.region
 
-// CHECK-LABEL: define internal void @emit_for_reversed_inclusive_cmp..omp_par(
-// reversed inclusive cmp (9 >= i) produces same bounds as (i <= 9)
-// CHECK: store i32 0, ptr %p.lowerbound
-// CHECK: store i32 9, ptr %p.upperbound
-// CHECK: call void @__kmpc_for_static_init_4u(
+// CHECK: omp.par.region:
+// CHECK:    br label %omp.par.region1
 
-// Test compound assignment step: for (int i = 0; i < 20; i += 2)
-void emit_for_compound_step() {
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; i < 20; i += 2) {
-      during(i);
-    }
-  }
-}
+// CHECK: omp.par.region1:
+// initialize lb=1, ub=10 and step=1 (i=0 is stored after the ub/step loads below)
+// CHECK:    store i32 1, ptr %[[LOADGEP_LB]], align 4
+// CHECK:    store i64 10, ptr %[[LOADGEP_UB]], align 8
+// CHECK:    store i16 1, ptr %[[LOADGEP_STEP]], align 2
+// load ub and step, truncate/extend to i32 for trip count computation
+// CHECK:    %[[UB_VAL:.*]] = load i64, ptr %[[LOADGEP_UB]], align 8
+// CHECK:    %[[UB_TRUNC:.*]] = trunc i64 %[[UB_VAL]] to i32
+// CHECK:    %[[STEP_VAL:.*]] = load i16, ptr %[[LOADGEP_STEP]], align 2
+// CHECK:    %[[STEP_SEXT:.*]] = sext i16 %[[STEP_VAL]] to i32
+// CHECK:    store i32 0, ptr %[[LOADGEP_I]], align 4
+// CHECK:    br label %omp.wsloop.region
 
-// CHECK-LABEL: define internal void @emit_for_compound_step..omp_par(
-// step = 2 visible in the loop body IV computation
-// CHECK: call void @__kmpc_for_static_init_4u(
-// CHECK: omp_loop.body:
-// CHECK: %{{.*}} = mul i32 %{{.*}}, 2
-// CHECK: omp.loop_nest.region:
-// CHECK: call void @during(i32 %{{.*}})
+// CHECK: omp.wsloop.region:
+// compute absolute value of step to normalize direction
+// CHECK:    %[[STEP_NEG:.*]] = icmp slt i32 %[[STEP_SEXT]], 0
+// CHECK:    %[[STEP_NEG_VAL:.*]] = sub i32 0, %[[STEP_SEXT]]
+// CHECK:    %[[STEP_ABS:.*]] = select i1 %[[STEP_NEG]], i32 %[[STEP_NEG_VAL]], i32 %[[STEP_SEXT]]
+// select lb/ub based on step direction
+// CHECK:    %[[RANGE_LO:.*]] = select i1 %[[STEP_NEG]], i32 %[[UB_TRUNC]], i32 0
+// CHECK:    %[[RANGE_HI:.*]] = select i1 %[[STEP_NEG]], i32 0, i32 %[[UB_TRUNC]]
+// CHECK:    %[[RANGE_DIFF:.*]] = sub nsw i32 %[[RANGE_HI]], %[[RANGE_LO]]
+// CHECK:    %[[RANGE_EMPTY:.*]] = icmp sle i32 %[[RANGE_HI]], %[[RANGE_LO]]
+// compute trip count = (diff - 1) / abs(step) + 1
+// CHECK:    %[[TC_SUB:.*]] = sub i32 %[[RANGE_DIFF]], 1
+// CHECK:    %[[TC_DIV:.*]] = udiv i32 %[[TC_SUB]], %[[STEP_ABS]]
+// CHECK:    %[[TC_ADD:.*]] = add i32 %[[TC_DIV]], 1
+// CHECK:    %[[TC_ONE:.*]] = icmp ule i32 %[[RANGE_DIFF]], %[[STEP_ABS]]
+// CHECK:    %[[TC_CLAMPED:.*]] = select i1 %[[TC_ONE]], i32 1, i32 %[[TC_ADD]]
+// CHECK:    %omp_loop.tripcount = select i1 %[[RANGE_EMPTY]], i32 0, i32 %[[TC_CLAMPED]]
+// CHECK:    br label %omp_loop.preheader
 
-// Test commuted step expression: for (int i = 0; i < 30; i = step + i)
-void emit_for_commuted_step() {
-  short step = 3;
-#pragma omp parallel
-  {
-#pragma omp for
-    for (int i = 0; i < 30; i = step + i) {
-      during(i);
-    }
-  }
-}
+// CHECK: omp_loop.preheader:
+// set normalized loop bounds: lb=0, ub=tripcount-1, stride=1
+// CHECK:    store i32 0, ptr %p.lowerbound, align 4
+// CHECK:    %[[TC_MINUS1:.*]] = sub i32 %omp_loop.tripcount, 1
+// CHECK:    store i32 %[[TC_MINUS1]], ptr %p.upperbound, align 4
+// CHECK:    store i32 1, ptr %p.stride, align 4
+// CHECK:    %[[GTN_WSLOOP:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    call void @__kmpc_for_static_init_4u(ptr @{{[0-9]+}}, i32 %[[GTN_WSLOOP]], i32 34, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound, ptr %p.stride, i32 1, i32 0)
+// reload thread-local lb/ub after static partitioning and compute local trip count
+// CHECK:    %[[SF_LB:.*]] = load i32, ptr %p.lowerbound, align 4
+// CHECK:    %[[SF_UB:.*]] = load i32, ptr %p.upperbound, align 4
+// CHECK:    %[[SF_DIFF:.*]] = sub i32 %[[SF_UB]], %[[SF_LB]]
+// CHECK:    %[[SF_TC:.*]] = add i32 %[[SF_DIFF]], 1
+// CHECK:    br label %omp_loop.header
+
+// CHECK: omp_loop.header:
+// CHECK:    %omp_loop.iv = phi i32 [ 0, %omp_loop.preheader ], [ %[[LOOP_NEXT:.*]], %omp_loop.inc ]
+// CHECK:    br label %omp_loop.cond
+
+// CHECK: omp_loop.cond:
+// CHECK:    %[[LOOP_CMP:.*]] = icmp ult i32 %omp_loop.iv, %[[SF_TC]]
+// CHECK:    br i1 %[[LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
+
+// CHECK: omp_loop.exit:
+// CHECK:    call void @__kmpc_for_static_fini(ptr @{{[0-9]+}}, i32 %[[GTN_WSLOOP]])
+// CHECK:    %[[GTN_BARRIER:.*]] = call i32 @__kmpc_global_thread_num(ptr @{{[0-9]+}})
+// CHECK:    call void @__kmpc_barrier(ptr @{{[0-9]+}}, i32 %[[GTN_BARRIER]])
+// CHECK:    br label %omp_loop.after
+
+// CHECK: omp_loop.after:
+// CHECK:    br label %omp.region.cont3
+
+// CHECK: omp.region.cont3:
+// CHECK:    br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK:    br label %omp.par.pre_finalize
+
+// CHECK: omp.par.pre_finalize:
+// CHECK:    br label %.fini
+
+// CHECK: .fini:
+// CHECK:    br label %omp.par.exit.exitStub
 
-// CHECK-LABEL: define internal void @emit_for_commuted_step..omp_par(
-// variable step loaded and sign-extended from i16
-// CHECK: %{{.*}} = sext i16 %{{.*}} to i32
-// CHECK: call void @__kmpc_for_static_init_4u(
 // CHECK: omp_loop.body:
-// step is variable, multiplied into IV
-// CHECK: %{{.*}} = mul i32 %{{.*}}, %{{.*}}
+// real IV = (normalized_iv + lb_offset) * step + init_val
+// CHECK:    %[[BODY_IV:.*]] = add i32 %omp_loop.iv, %[[SF_LB]]
+// CHECK:    %[[BODY_SCALED:.*]] = mul i32 %[[BODY_IV]], %[[STEP_SEXT]]
+// CHECK:    %[[BODY_FINAL:.*]] = add i32 %[[BODY_SCALED]], 0
+// CHECK:    br label %omp.loop_nest.region
+
 // CHECK: omp.loop_nest.region:
-// CHECK: call void @during(i32 %{{.*}})
-
-// Verify OpenMP runtime declarations
-// CHECK: declare i32 @__kmpc_global_thread_num(ptr)
-// CHECK: declare void @__kmpc_for_static_init_4u(ptr, i32, i32, ptr, ptr, ptr, ptr, i32, i32)
-// CHECK: declare void @__kmpc_for_static_fini(ptr, i32)
-// CHECK: declare void @__kmpc_barrier(ptr, i32)
-// CHECK: declare {{.*}}void @__kmpc_fork_call(ptr, i32, ptr, ...)
+// store computed IV to i's alloca; load j and call during(j)
+// CHECK:    store i32 %[[BODY_FINAL]], ptr %[[LOADGEP_I]], align 4
+// CHECK:    %[[J_DURING:.*]] = load i32, ptr %[[LOADGEP_J]], align 4
+// CHECK:    call void @during(i32 %[[J_DURING]])
+// CHECK:    br label %omp.region.cont4
+
+// CHECK: omp.region.cont4:
+// CHECK:    br label %omp_loop.inc
+
+// CHECK: omp_loop.inc:
+// CHECK:    %[[LOOP_NEXT]] = add nuw i32 %omp_loop.iv, 1
+// CHECK:    br label %omp_loop.header
+
+// CHECK: omp.par.exit.exitStub:
+// CHECK:    ret void
+

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [CIR][OpenMP] Emit #pragma omp for as omp.wsloop + omp.loop_nest (PR #181841)

Reply via email to