https://github.com/erichkeane created 
https://github.com/llvm/llvm-project/pull/136626

Async acts just like num_workers/vector_length in that it gets a new variant 
per device_type and is lowered as an operand.

However, it has one additional complication, in that it can have a variant that 
has no argument, which produces an attribute with the correct devicetype.

Additionally, this syncronizes us with the implementation of flang,
  which prohibits multiple 'async' clauses per-device_type.

>From 554256a719265abeb3cac278fbb1a19d7b989545 Mon Sep 17 00:00:00 2001
From: erichkeane <eke...@nvidia.com>
Date: Mon, 21 Apr 2025 12:54:23 -0700
Subject: [PATCH] [OpenACC][CIR] Implement 'async' lowering.

Async acts just like num_workers/vector_length in that it gets a new
variant per device_type and is lowered as an operand.

However, it has one additional complication, in that it can have a
variant that has no argument, which produces an attribute with the
correct devicetype.

Additionally, this syncronizes us with the implementation of flang,
  which prohibits multiple 'async' clauses per-device_type.
---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp   | 42 +++++++++++++----
 clang/lib/Sema/SemaOpenACCClause.cpp          |  3 ++
 clang/test/CIR/CodeGenOpenACC/kernels.c       | 46 +++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/parallel.c      | 46 +++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/serial.c        | 46 +++++++++++++++++++
 .../compute-construct-async-clause.c          | 42 +++++++++++++++++
 6 files changed, 216 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp 
b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index e7dd2e74b0864..82fbb49db3bc8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -97,11 +97,13 @@ class OpenACCClauseCIREmitter final
 
   // Handle a clause affected by the 'device-type' to the point that they need
   // to have the attributes added in the correct/corresponding order, such as
-  // 'num_workers' or 'vector_length' on a compute construct.
-  mlir::ArrayAttr
-  handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes,
-                                 mlir::Value argument,
-                                 mlir::MutableOperandRange &argCollection) {
+  // 'num_workers' or 'vector_length' on a compute construct. For cases where 
we
+  // don't have an argument that needs to be added to an additional one (such 
as
+  // asyncOnly) we can use this with 'argument' as std::nullopt.
+  mlir::ArrayAttr handleDeviceTypeAffectedClause(
+      mlir::ArrayAttr existingDeviceTypes,
+      std::optional<mlir::Value> argument = std::nullopt,
+      mlir::MutableOperandRange *argCollection = nullptr) {
     llvm::SmallVector<mlir::Attribute> deviceTypes;
 
     // Collect the 'existing' device-type attributes so we can re-create them
@@ -120,13 +122,19 @@ class OpenACCClauseCIREmitter final
            lastDeviceTypeClause->getArchitectures()) {
         deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
             builder.getContext(), decodeDeviceType(arch.getIdentifierInfo())));
-        argCollection.append(argument);
+        if (argument) {
+          assert(argCollection);
+          argCollection->append(*argument);
+        }
       }
     } else {
       // Else, we just add a single for 'none'.
       deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
           builder.getContext(), mlir::acc::DeviceType::None));
-      argCollection.append(argument);
+      if (argument) {
+        assert(argCollection);
+        argCollection->append(*argument);
+      }
     }
 
     return mlir::ArrayAttr::get(builder.getContext(), deviceTypes);
@@ -205,7 +213,7 @@ class OpenACCClauseCIREmitter final
       mlir::MutableOperandRange range = operation.getNumWorkersMutable();
       operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause(
           operation.getNumWorkersDeviceTypeAttr(),
-          createIntExpr(clause.getIntExpr()), range));
+          createIntExpr(clause.getIntExpr()), &range));
     } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
       llvm_unreachable("num_workers not valid on serial");
     } else {
@@ -218,7 +226,7 @@ class OpenACCClauseCIREmitter final
       mlir::MutableOperandRange range = operation.getVectorLengthMutable();
       operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause(
           operation.getVectorLengthDeviceTypeAttr(),
-          createIntExpr(clause.getIntExpr()), range));
+          createIntExpr(clause.getIntExpr()), &range));
     } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
       llvm_unreachable("vector_length not valid on serial");
     } else {
@@ -226,6 +234,22 @@ class OpenACCClauseCIREmitter final
     }
   }
 
+  void VisitAsyncClause(const OpenACCAsyncClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+      if (!clause.hasIntExpr()) {
+        operation.setAsyncOnlyAttr(
+            handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr()));
+      } else {
+        mlir::MutableOperandRange range = operation.getAsyncOperandsMutable();
+        
operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause(
+            operation.getAsyncOperandsDeviceTypeAttr(),
+            createIntExpr(clause.getIntExpr()), &range));
+      }
+    } else {
+      return clauseNotImplemented(clause);
+    }
+  }
+
   void VisitSelfClause(const OpenACCSelfClause &clause) {
     if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
       if (clause.isEmptySelfClause()) {
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp 
b/clang/lib/Sema/SemaOpenACCClause.cpp
index 3694a831b76de..ed437ac62e332 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -639,6 +639,9 @@ OpenACCClause 
*SemaOpenACCClauseVisitor::VisitVectorLengthClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
+  if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause))
+    return nullptr;
+
   assert(Clause.getNumIntExprs() < 2 &&
          "Invalid number of expressions for Async");
   return OpenACCAsyncClause::Create(
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c 
b/clang/test/CIR/CodeGenOpenACC/kernels.c
index d2da1d18f1534..1744acf0ab223 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -210,5 +210,51 @@ void acc_kernels(int cond) {
   // CHECK-NEXT: acc.terminator
   // CHECK-NEXT: } loc
 
+#pragma acc kernels async
+  {}
+  // CHECK-NEXT: acc.kernels {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.kernels {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, 
#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : 
si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 
[#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 
[#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, 
#acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c 
b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 61dccc591c252..892d931c880e7 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -209,5 +209,51 @@ void acc_parallel(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc parallel async
+  {}
+  // CHECK-NEXT: acc.parallel {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.parallel {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, 
#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : 
si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 
[#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 
[#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, 
#acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c 
b/clang/test/CIR/CodeGenOpenACC/serial.c
index b72f44a2ea473..094958f0e3b23 100644
--- a/clang/test/CIR/CodeGenOpenACC/serial.c
+++ b/clang/test/CIR/CodeGenOpenACC/serial.c
@@ -106,5 +106,51 @@ void acc_serial(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc serial async
+  {}
+  // CHECK-NEXT: acc.serial {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.serial {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, 
#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : 
si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 
[#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, 
!s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 
[#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, 
#acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c 
b/clang/test/SemaOpenACC/compute-construct-async-clause.c
index 4895d7f2209bb..4ca963713254c 100644
--- a/clang/test/SemaOpenACC/compute-construct-async-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c
@@ -20,6 +20,48 @@ void Test() {
 #pragma acc serial async(1, 2)
   while(1);
 
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on 
a 'kernels' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async async
+  while(1);
+
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on 
a 'kernels' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async(1) async(2)
+  while(1);
+
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on 
a 'parallel' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc parallel async(1) async(2)
+  while(1);
+
+  // expected-error@+2{{OpenACC 'async' clause cannot appear more than once on 
a 'serial' directive}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc serial async(1) async(2)
+  while(1);
+
+  // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in 
a 'device_type' region on a 'kernels' directive}}
+  // expected-note@+2{{previous clause is here}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc kernels async(1) device_type(*) async(1) async(2)
+  while(1);
+  // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in 
a 'device_type' region on a 'parallel' directive}}
+  // expected-note@+2{{previous clause is here}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc parallel async device_type(*) async async
+  while(1);
+  // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in 
a 'device_type' region on a 'serial' directive}}
+  // expected-note@+2{{previous clause is here}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc serial async(1) device_type(*) async async(2)
+  while(1);
+
+  // expected-error@+3{{OpenACC 'async' clause cannot appear more than once in 
a 'device_type' region on a 'parallel' directive}}
+  // expected-note@+2{{previous clause is here}}
+  // expected-note@+1{{previous clause is here}}
+#pragma acc parallel device_type(*) async async
+  while(1);
+
   struct NotConvertible{} NC;
   // expected-error@+1{{OpenACC clause 'async' requires expression of integer 
type ('struct NotConvertible' invalid)}}
 #pragma acc parallel async(NC)

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to