https://github.com/erichkeane created https://github.com/llvm/llvm-project/pull/136626
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type. >From 554256a719265abeb3cac278fbb1a19d7b989545 Mon Sep 17 00:00:00 2001 From: erichkeane <eke...@nvidia.com> Date: Mon, 21 Apr 2025 12:54:23 -0700 Subject: [PATCH] [OpenACC][CIR] Implement 'async' lowering. Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand. However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype. Additionally, this syncronizes us with the implementation of flang, which prohibits multiple 'async' clauses per-device_type. --- clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 42 +++++++++++++---- clang/lib/Sema/SemaOpenACCClause.cpp | 3 ++ clang/test/CIR/CodeGenOpenACC/kernels.c | 46 +++++++++++++++++++ clang/test/CIR/CodeGenOpenACC/parallel.c | 46 +++++++++++++++++++ clang/test/CIR/CodeGenOpenACC/serial.c | 46 +++++++++++++++++++ .../compute-construct-async-clause.c | 42 +++++++++++++++++ 6 files changed, 216 insertions(+), 9 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index e7dd2e74b0864..82fbb49db3bc8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -97,11 +97,13 @@ class OpenACCClauseCIREmitter final // Handle a clause affected by the 'device-type' to the point that they need // to have the attributes added in the correct/corresponding order, such as - // 'num_workers' or 'vector_length' on a compute construct. - mlir::ArrayAttr - handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes, - mlir::Value argument, - mlir::MutableOperandRange &argCollection) { + // 'num_workers' or 'vector_length' on a compute construct. For cases where we + // don't have an argument that needs to be added to an additional one (such as + // asyncOnly) we can use this with 'argument' as std::nullopt. + mlir::ArrayAttr handleDeviceTypeAffectedClause( + mlir::ArrayAttr existingDeviceTypes, + std::optional<mlir::Value> argument = std::nullopt, + mlir::MutableOperandRange *argCollection = nullptr) { llvm::SmallVector<mlir::Attribute> deviceTypes; // Collect the 'existing' device-type attributes so we can re-create them @@ -120,13 +122,19 @@ class OpenACCClauseCIREmitter final lastDeviceTypeClause->getArchitectures()) { deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( builder.getContext(), decodeDeviceType(arch.getIdentifierInfo()))); - argCollection.append(argument); + if (argument) { + assert(argCollection); + argCollection->append(*argument); + } } } else { // Else, we just add a single for 'none'. deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( builder.getContext(), mlir::acc::DeviceType::None)); - argCollection.append(argument); + if (argument) { + assert(argCollection); + argCollection->append(*argument); + } } return mlir::ArrayAttr::get(builder.getContext(), deviceTypes); @@ -205,7 +213,7 @@ class OpenACCClauseCIREmitter final mlir::MutableOperandRange range = operation.getNumWorkersMutable(); operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause( operation.getNumWorkersDeviceTypeAttr(), - createIntExpr(clause.getIntExpr()), range)); + createIntExpr(clause.getIntExpr()), &range)); } else if constexpr (isOneOfTypes<OpTy, SerialOp>) { llvm_unreachable("num_workers not valid on serial"); } else { @@ -218,7 +226,7 @@ class OpenACCClauseCIREmitter final mlir::MutableOperandRange range = operation.getVectorLengthMutable(); operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause( operation.getVectorLengthDeviceTypeAttr(), - createIntExpr(clause.getIntExpr()), range)); + createIntExpr(clause.getIntExpr()), &range)); } else if constexpr (isOneOfTypes<OpTy, SerialOp>) { llvm_unreachable("vector_length not valid on serial"); } else { @@ -226,6 +234,22 @@ class OpenACCClauseCIREmitter final } } + void VisitAsyncClause(const OpenACCAsyncClause &clause) { + if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) { + if (!clause.hasIntExpr()) { + operation.setAsyncOnlyAttr( + handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr())); + } else { + mlir::MutableOperandRange range = operation.getAsyncOperandsMutable(); + operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause( + operation.getAsyncOperandsDeviceTypeAttr(), + createIntExpr(clause.getIntExpr()), &range)); + } + } else { + return clauseNotImplemented(clause); + } + } + void VisitSelfClause(const OpenACCSelfClause &clause) { if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) { if (clause.isEmptySelfClause()) { diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp index 3694a831b76de..ed437ac62e332 100644 --- a/clang/lib/Sema/SemaOpenACCClause.cpp +++ b/clang/lib/Sema/SemaOpenACCClause.cpp @@ -639,6 +639,9 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( SemaOpenACC::OpenACCParsedClause &Clause) { + if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause)) + return nullptr; + assert(Clause.getNumIntExprs() < 2 && "Invalid number of expressions for Async"); return OpenACCAsyncClause::Create( diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c index d2da1d18f1534..1744acf0ab223 100644 --- a/clang/test/CIR/CodeGenOpenACC/kernels.c +++ b/clang/test/CIR/CodeGenOpenACC/kernels.c @@ -210,5 +210,51 @@ void acc_kernels(int cond) { // CHECK-NEXT: acc.terminator // CHECK-NEXT: } loc +#pragma acc kernels async + {} + // CHECK-NEXT: acc.kernels { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc kernels async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels async device_type(nvidia, radeon) async + {} + // CHECK-NEXT: acc.kernels { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]} + +#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc kernels async device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc kernels async(3) device_type(nvidia, radeon) async + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c index 61dccc591c252..892d931c880e7 100644 --- a/clang/test/CIR/CodeGenOpenACC/parallel.c +++ b/clang/test/CIR/CodeGenOpenACC/parallel.c @@ -209,5 +209,51 @@ void acc_parallel(int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc parallel async + {} + // CHECK-NEXT: acc.parallel { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc parallel async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel async device_type(nvidia, radeon) async + {} + // CHECK-NEXT: acc.parallel { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]} + +#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel async device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc parallel async(3) device_type(nvidia, radeon) async + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} + // CHECK-NEXT: cir.return } diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c index b72f44a2ea473..094958f0e3b23 100644 --- a/clang/test/CIR/CodeGenOpenACC/serial.c +++ b/clang/test/CIR/CodeGenOpenACC/serial.c @@ -106,5 +106,51 @@ void acc_serial(int cond) { // CHECK-NEXT: acc.yield // CHECK-NEXT: } loc +#pragma acc serial async + {} + // CHECK-NEXT: acc.serial { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc serial async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial async device_type(nvidia, radeon) async + {} + // CHECK-NEXT: acc.serial { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]} + +#pragma acc serial async(3) device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial async device_type(nvidia, radeon) async(cond) + {} + // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i + // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32 + // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} + +#pragma acc serial async(3) device_type(nvidia, radeon) async + {} + // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i + // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32 + // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} + // CHECK-NEXT: cir.return } diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c b/clang/test/SemaOpenACC/compute-construct-async-clause.c index 4895d7f2209bb..4ca963713254c 100644 --- a/clang/test/SemaOpenACC/compute-construct-async-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c @@ -20,6 +20,48 @@ void Test() { #pragma acc serial async(1, 2) while(1); + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc kernels async async + while(1); + + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc kernels async(1) async(2) + while(1); + + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'parallel' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel async(1) async(2) + while(1); + + // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on a 'serial' directive}} + // expected-note@+1{{previous clause is here}} +#pragma acc serial async(1) async(2) + while(1); + + // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'kernels' directive}} + // expected-note@+2{{previous clause is here}} + // expected-note@+1{{previous clause is here}} +#pragma acc kernels async(1) device_type(*) async(1) async(2) + while(1); + // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}} + // expected-note@+2{{previous clause is here}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel async device_type(*) async async + while(1); + // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'serial' directive}} + // expected-note@+2{{previous clause is here}} + // expected-note@+1{{previous clause is here}} +#pragma acc serial async(1) device_type(*) async async(2) + while(1); + + // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}} + // expected-note@+2{{previous clause is here}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel device_type(*) async async + while(1); + struct NotConvertible{} NC; // expected-error@+1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}} #pragma acc parallel async(NC) _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits