Author: Erich Keane Date: 2025-04-21T11:51:26-07:00 New Revision: 0ae9dac262afccea1e1a2e02520f737ab38b286c
URL: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c DIFF: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c.diff LOG: [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (#136578) This patch also includes the first one to handle 'device_type' properly, which is where most of the 'challenge' here comes from. >From the best I can tell: we must keep two lists of the same size, 1 of all of the 'num_workers' items, and 1 of the 'device_type' value for that 'num_workers'. Additionally, the 'device_type' list can only handle single 'device_type' values, so we have to duplicate the 'num_workers' items in cases where there are multiple applicable 'device_type' values. This patch accomplishes this by keeping the two in sync, and saving the current 'device_type' in the visitor. Added: Modified: clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp clang/test/CIR/CodeGenOpenACC/kernels.c clang/test/CIR/CodeGenOpenACC/parallel.c Removed: ################################################################################ diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 30e840cbfa1f7..604fdf369860e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -46,10 +46,27 @@ class OpenACCClauseCIREmitter final // diagnostics are gone. SourceLocation dirLoc; + const OpenACCDeviceTypeClause *lastDeviceTypeClause = nullptr; + void clauseNotImplemented(const OpenACCClause &c) { cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind()); } + mlir::Value createIntExpr(const Expr *intExpr) { + mlir::Value expr = cgf.emitScalarExpr(intExpr); + mlir::Location exprLoc = cgf.cgm.getLoc(intExpr->getBeginLoc()); + + mlir::IntegerType targetType = mlir::IntegerType::get( + &cgf.getMLIRContext(), cgf.getContext().getIntWidth(intExpr->getType()), + intExpr->getType()->isSignedIntegerOrEnumerationType() + ? mlir::IntegerType::SignednessSemantics::Signed + : mlir::IntegerType::SignednessSemantics::Unsigned); + + auto conversionOp = builder.create<mlir::UnrealizedConversionCastOp>( + exprLoc, targetType, expr); + return conversionOp.getResult(0); + } + // 'condition' as an OpenACC grammar production is used for 'if' and (some // variants of) 'self'. It needs to be emitted as a signless-1-bit value, so // this function emits the expression, then sets the unrealized conversion @@ -109,6 +126,7 @@ class OpenACCClauseCIREmitter final } void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) { + lastDeviceTypeClause = &clause; if constexpr (isOneOfTypes<OpTy, InitOp, ShutdownOp>) { llvm::SmallVector<mlir::Attribute> deviceTypes; std::optional<mlir::ArrayAttr> existingDeviceTypes = @@ -116,7 +134,7 @@ class OpenACCClauseCIREmitter final // Ensure we keep the existing ones, and in the correct 'new' order. if (existingDeviceTypes) { - for (const mlir::Attribute &Attr : *existingDeviceTypes) + for (mlir::Attribute Attr : *existingDeviceTypes) deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( builder.getContext(), cast<mlir::acc::DeviceTypeAttr>(Attr).getValue())); @@ -136,6 +154,51 @@ class OpenACCClauseCIREmitter final if (!clause.getArchitectures().empty()) operation.setDeviceType( decodeDeviceType(clause.getArchitectures()[0].getIdentifierInfo())); + } else if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) { + // Nothing to do here, these constructs don't have any IR for these, as + // they just modify the other clauses IR. So setting of `lastDeviceType` + // (done above) is all we need. + } else { + return clauseNotImplemented(clause); + } + } + + void VisitNumWorkersClause(const OpenACCNumWorkersClause &clause) { + if constexpr (isOneOfTypes<OpTy, ParallelOp, KernelsOp>) { + // Collect the 'existing' device-type attributes so we can re-create them + // and insert them. + llvm::SmallVector<mlir::Attribute> deviceTypes; + mlir::ArrayAttr existingDeviceTypes = + operation.getNumWorkersDeviceTypeAttr(); + + if (existingDeviceTypes) { + for (mlir::Attribute Attr : existingDeviceTypes) + deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), + cast<mlir::acc::DeviceTypeAttr>(Attr).getValue())); + } + + // Insert 1 version of the 'int-expr' to the NumWorkers list per-current + // device type. + mlir::Value intExpr = createIntExpr(clause.getIntExpr()); + if (lastDeviceTypeClause) { + for (const DeviceTypeArgument &arg : + lastDeviceTypeClause->getArchitectures()) { + deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), decodeDeviceType(arg.getIdentifierInfo()))); + operation.getNumWorkersMutable().append(intExpr); + } + } else { + // Else, we just add a single for 'none'. + deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None)); + operation.getNumWorkersMutable().append(intExpr); + } + + operation.setNumWorkersDeviceTypeAttr( + mlir::ArrayAttr::get(builder.getContext(), deviceTypes)); + } else if constexpr (isOneOfTypes<OpTy, SerialOp>) { + llvm_unreachable("num_workers not valid on serial"); } else { return clauseNotImplemented(clause); } diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index ca5bfebcb4ff3..6459b310546cd 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -106,5 +106,57 @@ void acc_kernels(int cond) { // CHECK-NEXT: acc.terminator // CHECK-NEXT: } loc +#pragma acc kernels num_workers(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2u) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32 + // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(3) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>], %[[THREE_CAST]] : si32 [#acc.device_type<multicore>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3) + {} + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.kernels num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c index 3fb0b987409db..bdb506ee7e1d2 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -105,5 +105,57 @@ void acc_parallel(int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc parallel num_workers(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2u) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32 + // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i + // CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3) + {} + // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.parallel num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: cir.return } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits