https://github.com/skc7 updated https://github.com/llvm/llvm-project/pull/188007

>From 8e53f91820aa1158951de2ae7beac94fcaed0545 Mon Sep 17 00:00:00 2001
From: skc7 <[email protected]>
Date: Mon, 23 Mar 2026 15:20:23 +0530
Subject: [PATCH 1/3] [CIR][AMDGPU] Add AMDGPU-specific function attributes for
 HIP kernels

---
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        |  14 +-
 clang/lib/CIR/CodeGen/CMakeLists.txt          |   1 +
 clang/lib/CIR/CodeGen/TargetInfo.cpp          |  10 +
 clang/lib/CIR/CodeGen/TargetInfo.h            |   5 +
 clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp      | 256 ++++++++++++++++++
 .../Lowering/DirectToLLVM/LowerToLLVMIR.cpp   |  27 +-
 clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip    |  82 ++++++
 7 files changed, 386 insertions(+), 9 deletions(-)
 create mode 100644 clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
 create mode 100644 clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip

diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index f3ab733bf4c6a..4be669777bb26 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -669,7 +669,7 @@ void CIRGenModule::setNonAliasAttributes(GlobalDecl gd, 
mlir::Operation *op) {
   assert(!cir::MissingFeatures::opFuncCPUAndFeaturesAttributes());
   assert(!cir::MissingFeatures::opFuncSection());
 
-  assert(!cir::MissingFeatures::setTargetAttributes());
+  getTargetCIRGenInfo().setTargetAttributes(gd.getDecl(), op, *this);
 }
 
 std::optional<cir::SourceLanguage> CIRGenModule::getCIRSourceLanguage() const {
@@ -2557,12 +2557,15 @@ void CIRGenModule::setFunctionAttributes(GlobalDecl 
globalDecl,
   // represent them in dedicated ops. The correct attributes are ensured during
   // translation to LLVM. Thus, we don't need to check for them here.
 
+  const auto *funcDecl = cast<FunctionDecl>(globalDecl.getDecl());
+
   if (!isIncompleteFunction)
     setCIRFunctionAttributes(globalDecl,
                              getTypes().arrangeGlobalDeclaration(globalDecl),
                              func, isThunk);
 
-  assert(!cir::MissingFeatures::setTargetAttributes());
+  if (!isIncompleteFunction && func.isDeclaration())
+    getTargetCIRGenInfo().setTargetAttributes(funcDecl, func, *this);
 
   // TODO(cir): This needs a lot of work to better match CodeGen. That
   // ultimately ends up in setGlobalVisibility, which already has the linkage 
of
@@ -2574,17 +2577,16 @@ void CIRGenModule::setFunctionAttributes(GlobalDecl 
globalDecl,
   }
 
   // If we plan on emitting this inline builtin, we can't treat it as a 
builtin.
-  const auto *fd = cast<FunctionDecl>(globalDecl.getDecl());
-  if (fd->isInlineBuiltinDeclaration()) {
+  if (funcDecl->isInlineBuiltinDeclaration()) {
     const FunctionDecl *fdBody;
-    bool hasBody = fd->hasBody(fdBody);
+    bool hasBody = funcDecl->hasBody(fdBody);
     (void)hasBody;
     assert(hasBody && "Inline builtin declarations should always have an "
                       "available body!");
     assert(!cir::MissingFeatures::attributeNoBuiltin());
   }
 
-  if (fd->isReplaceableGlobalAllocationFunction()) {
+  if (funcDecl->isReplaceableGlobalAllocationFunction()) {
     // A replaceable global allocation function does not act like a builtin by
     // default, only if it is invoked by a new-expression or delete-expression.
     func->setAttr(cir::CIRDialect::getNoBuiltinAttrName(),
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt 
b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8548cc8424527..9b8fdf551ef10 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -51,6 +51,7 @@ add_clang_library(clangCIR
   CIRGenTypes.cpp
   CIRGenVTables.cpp
   TargetInfo.cpp
+  Targets/AMDGPU.cpp
 
   DEPENDS
   MLIRCIR
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp 
b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index ee68d9c329b83..3859588c5cfaf 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -1,6 +1,7 @@
 #include "TargetInfo.h"
 #include "ABIInfo.h"
 #include "CIRGenFunction.h"
+#include "CIRGenModule.h"
 #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h"
 #include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
@@ -53,6 +54,15 @@ class AMDGPUTargetCIRGenInfo : public TargetCIRGenInfo {
 public:
   AMDGPUTargetCIRGenInfo(CIRGenTypes &cgt)
       : TargetCIRGenInfo(std::make_unique<AMDGPUABIInfo>(cgt)) {}
+
+  void setTargetAttributes(const clang::Decl *decl, mlir::Operation *global,
+                           CIRGenModule &cgm) const override {
+    auto func = mlir::dyn_cast<cir::FuncOp>(global);
+    if (!func)
+      return;
+
+    setAMDGPUTargetFunctionAttributes(decl, func, cgm);
+  }
 };
 
 } // namespace
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.h 
b/clang/lib/CIR/CodeGen/TargetInfo.h
index b397d8cd7fab8..868af0e8343fb 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.h
+++ b/clang/lib/CIR/CodeGen/TargetInfo.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h"
 #include "clang/Basic/AddressSpaces.h"
 #include "clang/CIR/Dialect/IR/CIRAttrs.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/IR/CIROpsEnums.h"
 
 #include <memory>
@@ -135,6 +136,10 @@ class TargetCIRGenInfo {
 std::unique_ptr<TargetCIRGenInfo>
 createAMDGPUTargetCIRGenInfo(CIRGenTypes &cgt);
 
+/// Set AMDGPU-specific function attributes for HIP kernels.
+void setAMDGPUTargetFunctionAttributes(const clang::Decl *decl,
+                                       cir::FuncOp func, CIRGenModule &cgm);
+
 std::unique_ptr<TargetCIRGenInfo> createX8664TargetCIRGenInfo(CIRGenTypes 
&cgt);
 
 std::unique_ptr<TargetCIRGenInfo> createNVPTXTargetCIRGenInfo(CIRGenTypes 
&cgt);
diff --git a/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
new file mode 100644
index 0000000000000..280cb6ae5865c
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
@@ -0,0 +1,256 @@
+//===---- AMDGPU.cpp - AMDGPU-specific CIR CodeGen 
------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides AMDGPU-specific CIR CodeGen logic for function attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../CIRGenModule.h"
+#include "../TargetInfo.h"
+
+#include "clang/AST/Attr.h"
+#include "clang/AST/Decl.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+
+/// Check if AMDGPU protected visibility is required.
+static bool requiresAMDGPUProtectedVisibility(const clang::Decl *d,
+                                              cir::FuncOp func) {
+  if (func.getGlobalVisibility() != cir::VisibilityKind::Hidden)
+    return false;
+
+  if (d->hasAttr<OMPDeclareTargetDeclAttr>())
+    return false;
+
+  return d->hasAttr<DeviceKernelAttr>() ||
+         (clang::isa<clang::FunctionDecl>(d) && d->hasAttr<CUDAGlobalAttr>());
+}
+
+/// Handle amdgpu-flat-work-group-size attribute.
+static void handleAMDGPUFlatWorkGroupSizeAttr(const clang::FunctionDecl *fd,
+                                              cir::FuncOp func,
+                                              CIRGenModule &cgm,
+                                              bool isOpenCLKernel) {
+  auto &builder = cgm.getBuilder();
+  const auto *flatWGS = fd->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
+  const auto *reqdWGS =
+      cgm.getLangOpts().OpenCL ? fd->getAttr<ReqdWorkGroupSizeAttr>() : 
nullptr;
+
+  if (flatWGS || reqdWGS) {
+    unsigned min = 0, max = 0;
+    if (flatWGS) {
+      min = flatWGS->getMin()
+                ->EvaluateKnownConstInt(cgm.getASTContext())
+                .getExtValue();
+      max = flatWGS->getMax()
+                ->EvaluateKnownConstInt(cgm.getASTContext())
+                .getExtValue();
+    }
+    if (reqdWGS && min == 0 && max == 0) {
+      min = max = reqdWGS->getXDim()
+                      ->EvaluateKnownConstInt(cgm.getASTContext())
+                      .getExtValue() *
+                  reqdWGS->getYDim()
+                      ->EvaluateKnownConstInt(cgm.getASTContext())
+                      .getExtValue() *
+                  reqdWGS->getZDim()
+                      ->EvaluateKnownConstInt(cgm.getASTContext())
+                      .getExtValue();
+    }
+    if (min != 0) {
+      assert(min <= max && "Min must be less than or equal Max");
+      std::string attrVal = llvm::utostr(min) + "," + llvm::utostr(max);
+      func->setAttr("cir.amdgpu-flat-work-group-size",
+                    builder.getStringAttr(attrVal));
+    } else {
+      assert(max == 0 && "Max must be zero");
+    }
+  } else {
+    const unsigned defaultMax =
+        isOpenCLKernel ? 256 : cgm.getLangOpts().GPUMaxThreadsPerBlock;
+    std::string attrVal = std::string("1,") + llvm::utostr(defaultMax);
+    func->setAttr("cir.amdgpu-flat-work-group-size",
+                  builder.getStringAttr(attrVal));
+  }
+}
+
+/// Handle amdgpu-waves-per-eu attribute.
+static void handleAMDGPUWavesPerEUAttr(const clang::FunctionDecl *fd,
+                                       cir::FuncOp func, CIRGenModule &cgm) {
+  const auto *attr = fd->getAttr<AMDGPUWavesPerEUAttr>();
+  if (!attr)
+    return;
+
+  auto &builder = cgm.getBuilder();
+  unsigned min =
+      attr->getMin()->EvaluateKnownConstInt(cgm.getASTContext()).getExtValue();
+  unsigned max = attr->getMax()
+                     ? attr->getMax()
+                           ->EvaluateKnownConstInt(cgm.getASTContext())
+                           .getExtValue()
+                     : 0;
+
+  if (min != 0) {
+    assert((max == 0 || min <= max) && "Min must be less than or equal Max");
+    std::string attrVal = llvm::utostr(min);
+    if (max != 0)
+      attrVal = attrVal + "," + llvm::utostr(max);
+    func->setAttr("cir.amdgpu-waves-per-eu", builder.getStringAttr(attrVal));
+  } else {
+    assert(max == 0 && "Max must be zero");
+  }
+}
+
+/// Handle amdgpu-num-sgpr attribute.
+static void handleAMDGPUNumSGPRAttr(const clang::FunctionDecl *fd,
+                                    cir::FuncOp func, CIRGenModule &cgm) {
+  const auto *attr = fd->getAttr<AMDGPUNumSGPRAttr>();
+  if (!attr)
+    return;
+
+  uint32_t numSGPR = attr->getNumSGPR();
+  if (numSGPR != 0) {
+    auto &builder = cgm.getBuilder();
+    func->setAttr("cir.amdgpu-num-sgpr",
+                  builder.getStringAttr(llvm::utostr(numSGPR)));
+  }
+}
+
+/// Handle amdgpu-num-vgpr attribute.
+static void handleAMDGPUNumVGPRAttr(const clang::FunctionDecl *fd,
+                                    cir::FuncOp func, CIRGenModule &cgm) {
+  const auto *attr = fd->getAttr<AMDGPUNumVGPRAttr>();
+  if (!attr)
+    return;
+
+  uint32_t numVGPR = attr->getNumVGPR();
+  if (numVGPR != 0) {
+    auto &builder = cgm.getBuilder();
+    func->setAttr("cir.amdgpu-num-vgpr",
+                  builder.getStringAttr(llvm::utostr(numVGPR)));
+  }
+}
+
+/// Handle amdgpu-max-num-workgroups attribute.
+static void handleAMDGPUMaxNumWorkGroupsAttr(const clang::FunctionDecl *fd,
+                                             cir::FuncOp func,
+                                             CIRGenModule &cgm) {
+  const auto *attr = fd->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
+  if (!attr)
+    return;
+
+  auto &builder = cgm.getBuilder();
+  uint32_t x = attr->getMaxNumWorkGroupsX()
+                   ->EvaluateKnownConstInt(cgm.getASTContext())
+                   .getExtValue();
+  uint32_t y = attr->getMaxNumWorkGroupsY()
+                   ? attr->getMaxNumWorkGroupsY()
+                         ->EvaluateKnownConstInt(cgm.getASTContext())
+                         .getExtValue()
+                   : 1;
+  uint32_t z = attr->getMaxNumWorkGroupsZ()
+                   ? attr->getMaxNumWorkGroupsZ()
+                         ->EvaluateKnownConstInt(cgm.getASTContext())
+                         .getExtValue()
+                   : 1;
+
+  llvm::SmallString<32> attrVal;
+  llvm::raw_svector_ostream os(attrVal);
+  os << x << ',' << y << ',' << z;
+  func->setAttr("cir.amdgpu-max-num-workgroups",
+                builder.getStringAttr(attrVal.str()));
+}
+
+/// Handle amdgpu-cluster-dims attribute.
+static void handleAMDGPUClusterDimsAttr(const clang::FunctionDecl *fd,
+                                        cir::FuncOp func, CIRGenModule &cgm,
+                                        bool isOpenCLKernel) {
+  auto &builder = cgm.getBuilder();
+
+  if (const auto *attr = fd->getAttr<CUDAClusterDimsAttr>()) {
+    auto getExprVal = [&](const Expr *e) {
+      return e ? e->EvaluateKnownConstInt(cgm.getASTContext()).getExtValue()
+               : 1;
+    };
+    unsigned x = getExprVal(attr->getX());
+    unsigned y = getExprVal(attr->getY());
+    unsigned z = getExprVal(attr->getZ());
+
+    llvm::SmallString<32> attrVal;
+    llvm::raw_svector_ostream os(attrVal);
+    os << x << ',' << y << ',' << z;
+    func->setAttr("cir.amdgpu-cluster-dims",
+                  builder.getStringAttr(attrVal.str()));
+  }
+
+  const clang::TargetInfo &targetInfo = cgm.getASTContext().getTargetInfo();
+  if ((isOpenCLKernel &&
+       targetInfo.hasFeatureEnabled(targetInfo.getTargetOpts().FeatureMap,
+                                    "clusters")) ||
+      fd->hasAttr<CUDANoClusterAttr>()) {
+    func->setAttr("cir.amdgpu-cluster-dims", builder.getStringAttr("0,0,0"));
+  }
+}
+
+/// Handle amdgpu-ieee attribute.
+static void handleAMDGPUIEEEAttr(cir::FuncOp func, CIRGenModule &cgm) {
+  if (!cgm.getCodeGenOpts().EmitIEEENaNCompliantInsts) {
+    auto &builder = cgm.getBuilder();
+    func->setAttr("cir.amdgpu-ieee", builder.getStringAttr("false"));
+  }
+}
+
+} // anonymous namespace
+
+void clang::CIRGen::setAMDGPUTargetFunctionAttributes(const clang::Decl *decl,
+                                                      cir::FuncOp func,
+                                                      CIRGenModule &cgm) {
+  const auto *fd = clang::dyn_cast_or_null<clang::FunctionDecl>(decl);
+  if (!fd)
+    return;
+
+  if (func.isDeclaration())
+    return;
+
+  // Set protected visibility for AMDGPU kernels
+  if (requiresAMDGPUProtectedVisibility(decl, func)) {
+    func.setGlobalVisibility(cir::VisibilityKind::Protected);
+    func.setDSOLocal(true);
+  }
+
+  const bool isOpenCLKernel =
+      cgm.getLangOpts().OpenCL && fd->hasAttr<DeviceKernelAttr>();
+  const bool isHIPKernel =
+      cgm.getLangOpts().HIP && fd->hasAttr<CUDAGlobalAttr>();
+
+  if (!isOpenCLKernel && !isHIPKernel)
+    return;
+
+  // Set HIP kernel calling convention
+  if (isHIPKernel) {
+    // TODO(CIR) : Add amdgpu calling conv.
+    func.setVisibility(mlir::SymbolTable::Visibility::Public);
+    func.setLinkageAttr(cir::GlobalLinkageKindAttr::get(
+        func.getContext(), cir::GlobalLinkageKind::ExternalLinkage));
+  }
+
+  handleAMDGPUFlatWorkGroupSizeAttr(fd, func, cgm, isOpenCLKernel);
+  handleAMDGPUWavesPerEUAttr(fd, func, cgm);
+  handleAMDGPUNumSGPRAttr(fd, func, cgm);
+  handleAMDGPUNumVGPRAttr(fd, func, cgm);
+  handleAMDGPUMaxNumWorkGroupsAttr(fd, func, cgm);
+  handleAMDGPUClusterDimsAttr(fd, func, cgm, isOpenCLKernel);
+  handleAMDGPUIEEEAttr(func, cgm);
+}
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
index 2a95cfb9371b1..dbedbb5647aa5 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
@@ -15,9 +15,7 @@
 #include "mlir/IR/DialectRegistry.h"
 #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
-#include "clang/CIR/Dialect/IR/CIRAttrs.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
-#include "clang/CIR/MissingFeatures.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -54,7 +52,11 @@ class CIRDialectLLVMIRTranslationInterface
       mlir::Operation *op, llvm::ArrayRef<llvm::Instruction *> instructions,
       mlir::NamedAttribute attribute,
       mlir::LLVM::ModuleTranslation &moduleTranslation) const override {
-    if (auto mod = dyn_cast<mlir::ModuleOp>(op)) {
+    if (auto func = dyn_cast<mlir::LLVM::LLVMFuncOp>(op)) {
+      if (mlir::failed(
+              amendFunction(func, instructions, attribute, moduleTranslation)))
+        return mlir::failure();
+    } else if (auto mod = dyn_cast<mlir::ModuleOp>(op)) {
       if (mlir::failed(amendModule(mod, attribute, moduleTranslation)))
         return mlir::failure();
     }
@@ -62,6 +64,25 @@ class CIRDialectLLVMIRTranslationInterface
   }
 
 private:
+  // Translate CIR function attributes to LLVM function attributes.
+  mlir::LogicalResult
+  amendFunction(mlir::LLVM::LLVMFuncOp func,
+                llvm::ArrayRef<llvm::Instruction *> instructions,
+                mlir::NamedAttribute attribute,
+                mlir::LLVM::ModuleTranslation &moduleTranslation) const {
+    llvm::Function *llvmFunc = 
moduleTranslation.lookupFunction(func.getName());
+    llvm::StringRef attrName = attribute.getName().strref();
+
+    // Strip the "cir." prefix to get the LLVM attribute name.
+    llvm::StringRef llvmAttrName = attrName.substr(strlen("cir."));
+    if (auto strAttr = mlir::dyn_cast<mlir::StringAttr>(attribute.getValue())) 
{
+      llvmFunc->addFnAttr(llvmAttrName, strAttr.getValue());
+      return mlir::success();
+    }
+
+    return mlir::success();
+  }
+
   // Translate CIR's module attributes to LLVM's module metadata
   mlir::LogicalResult
   amendModule(mlir::ModuleOp mod, mlir::NamedAttribute attribute,
diff --git a/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip 
b/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip
new file mode 100644
index 0000000000000..5a15f62899cf8
--- /dev/null
+++ b/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip
@@ -0,0 +1,82 @@
+#include "../CodeGenCUDA/Inputs/cuda.h"
+
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR %s --input-file=%t.cir
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM %s --input-file=%t.ll
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip \
+// RUN:            -fcuda-is-device -emit-llvm %s -o %t.ogcg.ll
+// RUN: FileCheck --check-prefix=LLVM %s --input-file=%t.ogcg.ll
+
+// Test that AMDGPU-specific attributes are generated for HIP kernels
+
+// Test: Default attributes for simple kernel
+// CIR: cir.func{{.*}} 
@_Z13kernel_simplev(){{.*}}"cir.amdgpu-flat-work-group-size" = "1,1024"
+// LLVM: define{{.*}} void @_Z13kernel_simplev(){{.*}} #[[SIMPLE_ATTR:[0-9]+]]
+__global__ void kernel_simple() {}
+
+// Test: Explicit flat work group size attribute
+// CIR: cir.func{{.*}} 
@_Z21kernel_flat_wg_size_1v(){{.*}}"cir.amdgpu-flat-work-group-size" = "64,128"
+// LLVM: define{{.*}} void @_Z21kernel_flat_wg_size_1v(){{.*}} 
#[[FLAT_WG_ATTR:[0-9]+]]
+__attribute__((amdgpu_flat_work_group_size(64, 128)))
+__global__ void kernel_flat_wg_size_1() {}
+
+// Test: Waves per EU attribute
+// CIR: cir.func{{.*}} 
@_Z19kernel_waves_per_euv(){{.*}}"cir.amdgpu-waves-per-eu" = "2"
+// LLVM: define{{.*}} void @_Z19kernel_waves_per_euv(){{.*}} 
#[[WAVES_ATTR:[0-9]+]]
+__attribute__((amdgpu_waves_per_eu(2)))
+__global__ void kernel_waves_per_eu() {}
+
+// Test: Waves per EU with min and max
+// CIR: cir.func{{.*}} 
@_Z22kernel_waves_per_eu_mmv(){{.*}}"cir.amdgpu-waves-per-eu" = "2,4"
+// LLVM: define{{.*}} void @_Z22kernel_waves_per_eu_mmv(){{.*}} 
#[[WAVES_MM_ATTR:[0-9]+]]
+__attribute__((amdgpu_waves_per_eu(2, 4)))
+__global__ void kernel_waves_per_eu_mm() {}
+
+// Test: Num SGPR attribute
+// CIR: cir.func{{.*}} @_Z15kernel_num_sgprv(){{.*}}"cir.amdgpu-num-sgpr" = 
"32"
+// LLVM: define{{.*}} void @_Z15kernel_num_sgprv(){{.*}} #[[SGPR_ATTR:[0-9]+]]
+__attribute__((amdgpu_num_sgpr(32)))
+__global__ void kernel_num_sgpr() {}
+
+// Test: Num VGPR attribute
+// CIR: cir.func{{.*}} @_Z15kernel_num_vgprv(){{.*}}"cir.amdgpu-num-vgpr" = 
"64"
+// LLVM: define{{.*}} void @_Z15kernel_num_vgprv(){{.*}} #[[VGPR_ATTR:[0-9]+]]
+__attribute__((amdgpu_num_vgpr(64)))
+__global__ void kernel_num_vgpr() {}
+
+// Test: Max num workgroups attribute
+// CIR: cir.func{{.*}} 
@_Z22kernel_max_num_wgroupsv(){{.*}}"cir.amdgpu-max-num-workgroups" = "8,4,2"
+// LLVM: define{{.*}} void @_Z22kernel_max_num_wgroupsv(){{.*}} 
#[[MAX_WG_ATTR:[0-9]+]]
+__attribute__((amdgpu_max_num_work_groups(8, 4, 2)))
+__global__ void kernel_max_num_wgroups() {}
+
+// Test: Combined attributes
+// CIR: cir.func{{.*}} 
@_Z15kernel_combinedv(){{.*}}"cir.amdgpu-flat-work-group-size" = 
"256,256"{{.*}}"cir.amdgpu-num-sgpr" = "48"{{.*}}"cir.amdgpu-num-vgpr" = 
"32"{{.*}}"cir.amdgpu-waves-per-eu" = "1,2"
+// LLVM: define{{.*}} void @_Z15kernel_combinedv(){{.*}} 
#[[COMBINED_ATTR:[0-9]+]]
+__attribute__((amdgpu_flat_work_group_size(256, 256)))
+__attribute__((amdgpu_waves_per_eu(1, 2)))
+__attribute__((amdgpu_num_sgpr(48)))
+__attribute__((amdgpu_num_vgpr(32)))
+__global__ void kernel_combined() {}
+
+// Test: Device function should NOT have kernel attributes
+// CIR: cir.func{{.*}} @_Z9device_fnv()
+// CIR-NOT: cir.amdgpu-flat-work-group-size
+// LLVM: define{{.*}} void @_Z9device_fnv()
+__device__ void device_fn() {}
+
+// Verify LLVM attributes
+// LLVM-DAG: attributes #[[SIMPLE_ATTR]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"
+// LLVM-DAG: attributes #[[FLAT_WG_ATTR]] = 
{{.*}}"amdgpu-flat-work-group-size"="64,128"
+// LLVM-DAG: attributes #[[WAVES_ATTR]] = {{.*}}"amdgpu-waves-per-eu"="2"
+// LLVM-DAG: attributes #[[WAVES_MM_ATTR]] = {{.*}}"amdgpu-waves-per-eu"="2,4"
+// LLVM-DAG: attributes #[[SGPR_ATTR]] = {{.*}}"amdgpu-num-sgpr"="32"
+// LLVM-DAG: attributes #[[VGPR_ATTR]] = {{.*}}"amdgpu-num-vgpr"="64"
+// LLVM-DAG: attributes #[[MAX_WG_ATTR]] = 
{{.*}}"amdgpu-max-num-workgroups"="8,4,2"
+// LLVM-DAG: attributes #[[COMBINED_ATTR]] = 
{{.*}}"amdgpu-flat-work-group-size"="256,256"{{.*}}"amdgpu-num-sgpr"="48"{{.*}}"amdgpu-num-vgpr"="32"{{.*}}"amdgpu-waves-per-eu"="1,2"

>From 682aedb605f1c8f36c2be1c9003a9f5603671230 Mon Sep 17 00:00:00 2001
From: skc7 <[email protected]>
Date: Mon, 23 Mar 2026 16:04:13 +0530
Subject: [PATCH 2/3] Add support for amdgpu-expand-waitcnt-profiling

---
 clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp      | 76 +++++++++++--------
 .../Lowering/DirectToLLVM/LowerToLLVMIR.cpp   |  5 +-
 clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip    | 16 ++++
 3 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
index 280cb6ae5865c..b1d92c40d6178 100644
--- a/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
@@ -42,7 +42,8 @@ static bool requiresAMDGPUProtectedVisibility(const 
clang::Decl *d,
 static void handleAMDGPUFlatWorkGroupSizeAttr(const clang::FunctionDecl *fd,
                                               cir::FuncOp func,
                                               CIRGenModule &cgm,
-                                              bool isOpenCLKernel) {
+                                              bool isOpenCLKernel,
+                                              bool isHIPKernel) {
   auto &builder = cgm.getBuilder();
   const auto *flatWGS = fd->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
   const auto *reqdWGS =
@@ -77,10 +78,15 @@ static void handleAMDGPUFlatWorkGroupSizeAttr(const 
clang::FunctionDecl *fd,
     } else {
       assert(max == 0 && "Max must be zero");
     }
-  } else {
-    const unsigned defaultMax =
-        isOpenCLKernel ? 256 : cgm.getLangOpts().GPUMaxThreadsPerBlock;
-    std::string attrVal = std::string("1,") + llvm::utostr(defaultMax);
+  } else if (isOpenCLKernel || isHIPKernel) {
+    // By default, restrict the maximum size to a value specified by
+    // --gpu-max-threads-per-block=n or its default value for HIP.
+    const unsigned openCLDefaultMaxWorkGroupSize = 256;
+    const unsigned defaultMaxWorkGroupSize =
+        isOpenCLKernel ? openCLDefaultMaxWorkGroupSize
+                       : cgm.getLangOpts().GPUMaxThreadsPerBlock;
+    std::string attrVal =
+        std::string("1,") + llvm::utostr(defaultMaxWorkGroupSize);
     func->setAttr("cir.amdgpu-flat-work-group-size",
                   builder.getStringAttr(attrVal));
   }
@@ -212,45 +218,51 @@ static void handleAMDGPUIEEEAttr(cir::FuncOp func, 
CIRGenModule &cgm) {
   }
 }
 
+/// Handle amdgpu-expand-waitcnt-profiling attribute.
+static void handleAMDGPUExpandWaitcntProfilingAttr(cir::FuncOp func,
+                                                   CIRGenModule &cgm) {
+  if (cgm.getCodeGenOpts().AMDGPUExpandWaitcntProfiling) {
+    auto &builder = cgm.getBuilder();
+    func->setAttr("cir.amdgpu-expand-waitcnt-profiling",
+                  builder.getStringAttr(""));
+  }
+}
+
 } // anonymous namespace
 
 void clang::CIRGen::setAMDGPUTargetFunctionAttributes(const clang::Decl *decl,
                                                       cir::FuncOp func,
                                                       CIRGenModule &cgm) {
-  const auto *fd = clang::dyn_cast_or_null<clang::FunctionDecl>(decl);
-  if (!fd)
-    return;
-
-  if (func.isDeclaration())
-    return;
-
-  // Set protected visibility for AMDGPU kernels
   if (requiresAMDGPUProtectedVisibility(decl, func)) {
     func.setGlobalVisibility(cir::VisibilityKind::Protected);
     func.setDSOLocal(true);
   }
 
-  const bool isOpenCLKernel =
-      cgm.getLangOpts().OpenCL && fd->hasAttr<DeviceKernelAttr>();
-  const bool isHIPKernel =
-      cgm.getLangOpts().HIP && fd->hasAttr<CUDAGlobalAttr>();
-
-  if (!isOpenCLKernel && !isHIPKernel)
+  if (func.isDeclaration())
     return;
 
-  // Set HIP kernel calling convention
-  if (isHIPKernel) {
-    // TODO(CIR) : Add amdgpu calling conv.
-    func.setVisibility(mlir::SymbolTable::Visibility::Public);
-    func.setLinkageAttr(cir::GlobalLinkageKindAttr::get(
-        func.getContext(), cir::GlobalLinkageKind::ExternalLinkage));
-  }
+  const auto *fd = clang::dyn_cast_or_null<clang::FunctionDecl>(decl);
+  if (fd) {
+    const bool isOpenCLKernel =
+        cgm.getLangOpts().OpenCL && fd->hasAttr<DeviceKernelAttr>();
+    const bool isHIPKernel =
+        cgm.getLangOpts().HIP && fd->hasAttr<CUDAGlobalAttr>();
+
+    if (isHIPKernel) {
+      // TODO(CIR) : Add amdgpu calling conv.
+      func.setVisibility(mlir::SymbolTable::Visibility::Public);
+      func.setLinkageAttr(cir::GlobalLinkageKindAttr::get(
+          func.getContext(), cir::GlobalLinkageKind::ExternalLinkage));
+    }
 
-  handleAMDGPUFlatWorkGroupSizeAttr(fd, func, cgm, isOpenCLKernel);
-  handleAMDGPUWavesPerEUAttr(fd, func, cgm);
-  handleAMDGPUNumSGPRAttr(fd, func, cgm);
-  handleAMDGPUNumVGPRAttr(fd, func, cgm);
-  handleAMDGPUMaxNumWorkGroupsAttr(fd, func, cgm);
-  handleAMDGPUClusterDimsAttr(fd, func, cgm, isOpenCLKernel);
+    handleAMDGPUFlatWorkGroupSizeAttr(fd, func, cgm, isOpenCLKernel,
+                                      isHIPKernel);
+    handleAMDGPUWavesPerEUAttr(fd, func, cgm);
+    handleAMDGPUNumSGPRAttr(fd, func, cgm);
+    handleAMDGPUNumVGPRAttr(fd, func, cgm);
+    handleAMDGPUMaxNumWorkGroupsAttr(fd, func, cgm);
+    handleAMDGPUClusterDimsAttr(fd, func, cgm, isOpenCLKernel);
+  }
   handleAMDGPUIEEEAttr(func, cgm);
+  handleAMDGPUExpandWaitcntProfilingAttr(func, cgm);
 }
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
index dbedbb5647aa5..dbcd0aed88056 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVMIR.cpp
@@ -75,11 +75,8 @@ class CIRDialectLLVMIRTranslationInterface
 
     // Strip the "cir." prefix to get the LLVM attribute name.
     llvm::StringRef llvmAttrName = attrName.substr(strlen("cir."));
-    if (auto strAttr = mlir::dyn_cast<mlir::StringAttr>(attribute.getValue())) 
{
+    if (auto strAttr = mlir::dyn_cast<mlir::StringAttr>(attribute.getValue()))
       llvmFunc->addFnAttr(llvmAttrName, strAttr.getValue());
-      return mlir::success();
-    }
-
     return mlir::success();
   }
 
diff --git a/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip 
b/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip
index 5a15f62899cf8..8c020ccf9c58b 100644
--- a/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip
+++ b/clang/test/CIR/CodeGenHIP/amdgpu-attrs.hip
@@ -13,6 +13,11 @@
 // RUN:            -fcuda-is-device -emit-llvm %s -o %t.ogcg.ll
 // RUN: FileCheck --check-prefix=LLVM %s --input-file=%t.ogcg.ll
 
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -mamdgpu-expand-waitcnt-profiling \
+// RUN:            -emit-cir %s -o %t.prof.cir
+// RUN: FileCheck --check-prefix=PROF %s --input-file=%t.prof.cir
+
 // Test that AMDGPU-specific attributes are generated for HIP kernels
 
 // Test: Default attributes for simple kernel
@@ -80,3 +85,14 @@ __device__ void device_fn() {}
 // LLVM-DAG: attributes #[[VGPR_ATTR]] = {{.*}}"amdgpu-num-vgpr"="64"
 // LLVM-DAG: attributes #[[MAX_WG_ATTR]] = 
{{.*}}"amdgpu-max-num-workgroups"="8,4,2"
 // LLVM-DAG: attributes #[[COMBINED_ATTR]] = 
{{.*}}"amdgpu-flat-work-group-size"="256,256"{{.*}}"amdgpu-num-sgpr"="48"{{.*}}"amdgpu-num-vgpr"="32"{{.*}}"amdgpu-waves-per-eu"="1,2"
+
+// Test: amdgpu-expand-waitcnt-profiling is set on all functions when enabled
+// PROF: cir.func{{.*}} 
@_Z13kernel_simplev(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z21kernel_flat_wg_size_1v(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z19kernel_waves_per_euv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z22kernel_waves_per_eu_mmv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z15kernel_num_sgprv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z15kernel_num_vgprv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z22kernel_max_num_wgroupsv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z15kernel_combinedv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"
+// PROF: cir.func{{.*}} 
@_Z9device_fnv(){{.*}}"cir.amdgpu-expand-waitcnt-profiling"

>From ad1bd2085fb1af97c24978736ab6f682d94ca029 Mon Sep 17 00:00:00 2001
From: skc7 <[email protected]>
Date: Tue, 24 Mar 2026 11:13:37 +0530
Subject: [PATCH 3/3] Update requiresAMDGPUProtectedVisibility and other minor
 fixes

---
 clang/include/clang/CIR/MissingFeatures.h |   1 -
 clang/lib/CIR/CodeGen/TargetInfo.cpp      |  17 +++-
 clang/lib/CIR/CodeGen/TargetInfo.h        |   4 +
 clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp  | 115 ++++++++++------------
 4 files changed, 67 insertions(+), 70 deletions(-)

diff --git a/clang/include/clang/CIR/MissingFeatures.h 
b/clang/include/clang/CIR/MissingFeatures.h
index 68db08a5580ca..5051044eb5d50 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -327,7 +327,6 @@ struct MissingFeatures {
   static bool setDLLStorageClass() { return false; }
   static bool setNonGC() { return false; }
   static bool setObjCGCLValueClass() { return false; }
-  static bool setTargetAttributes() { return false; }
   static bool shouldSplitConstantStore() { return false; }
   static bool shouldUseBZeroPlusStoresToInitialize() { return false; }
   static bool shouldUseMemSetToInitialize() { return false; }
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.cpp 
b/clang/lib/CIR/CodeGen/TargetInfo.cpp
index 3859588c5cfaf..a5cc74b18a8a0 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CIR/CodeGen/TargetInfo.cpp
@@ -57,11 +57,18 @@ class AMDGPUTargetCIRGenInfo : public TargetCIRGenInfo {
 
   void setTargetAttributes(const clang::Decl *decl, mlir::Operation *global,
                            CIRGenModule &cgm) const override {
-    auto func = mlir::dyn_cast<cir::FuncOp>(global);
-    if (!func)
-      return;
-
-    setAMDGPUTargetFunctionAttributes(decl, func, cgm);
+    if (auto func = mlir::dyn_cast<cir::FuncOp>(global)) {
+      if (requiresAMDGPUProtectedVisibility(decl, func.getGlobalVisibility())) {
+        func.setGlobalVisibility(cir::VisibilityKind::Protected);
+        func.setDSOLocal(true);
+      }
+      setAMDGPUTargetFunctionAttributes(decl, func, cgm);
+    } else if (auto gv = mlir::dyn_cast<cir::GlobalOp>(global)) {
+      if (requiresAMDGPUProtectedVisibility(decl, gv.getGlobalVisibility())) {
+        gv.setGlobalVisibility(cir::VisibilityKind::Protected);
+        gv.setDSOLocal(true);
+      }
+    }
   }
 };
 
diff --git a/clang/lib/CIR/CodeGen/TargetInfo.h b/clang/lib/CIR/CodeGen/TargetInfo.h
index 868af0e8343fb..5e0103093827b 100644
--- a/clang/lib/CIR/CodeGen/TargetInfo.h
+++ b/clang/lib/CIR/CodeGen/TargetInfo.h
@@ -136,6 +136,10 @@ class TargetCIRGenInfo {
 std::unique_ptr<TargetCIRGenInfo>
 createAMDGPUTargetCIRGenInfo(CIRGenTypes &cgt);
 
+/// Check if AMDGPU protected visibility is required.
+bool requiresAMDGPUProtectedVisibility(const clang::Decl *d,
+                                       cir::VisibilityKind visibility);
+
 /// Set AMDGPU-specific function attributes for HIP kernels.
 void setAMDGPUTargetFunctionAttributes(const clang::Decl *decl,
                                        cir::FuncOp func, CIRGenModule &cgm);
diff --git a/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
index b1d92c40d6178..566bca14c4d59 100644
--- a/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CIR/CodeGen/Targets/AMDGPU.cpp
@@ -23,28 +23,27 @@
 using namespace clang;
 using namespace clang::CIRGen;
 
-namespace {
-
-/// Check if AMDGPU protected visibility is required.
-static bool requiresAMDGPUProtectedVisibility(const clang::Decl *d,
-                                              cir::FuncOp func) {
-  if (func.getGlobalVisibility() != cir::VisibilityKind::Hidden)
-    return false;
-
-  if (d->hasAttr<OMPDeclareTargetDeclAttr>())
+bool clang::CIRGen::requiresAMDGPUProtectedVisibility(
+    const Decl *d, cir::VisibilityKind visibility) {
+  if (visibility != cir::VisibilityKind::Hidden)
     return false;
 
-  return d->hasAttr<DeviceKernelAttr>() ||
-         (clang::isa<clang::FunctionDecl>(d) && d->hasAttr<CUDAGlobalAttr>());
+  return !d->hasAttr<OMPDeclareTargetDeclAttr>() &&
+         (d->hasAttr<DeviceKernelAttr>() ||
+          (isa<FunctionDecl>(d) && d->hasAttr<CUDAGlobalAttr>()) ||
+          (isa<VarDecl>(d) &&
+           (d->hasAttr<CUDADeviceAttr>() || d->hasAttr<CUDAConstantAttr>() ||
+            cast<VarDecl>(d)->getType()->isCUDADeviceBuiltinSurfaceType() ||
+            cast<VarDecl>(d)->getType()->isCUDADeviceBuiltinTextureType())));
 }
 
+namespace {
+
 /// Handle amdgpu-flat-work-group-size attribute.
-static void handleAMDGPUFlatWorkGroupSizeAttr(const clang::FunctionDecl *fd,
-                                              cir::FuncOp func,
-                                              CIRGenModule &cgm,
-                                              bool isOpenCLKernel,
-                                              bool isHIPKernel) {
-  auto &builder = cgm.getBuilder();
+static void
+handleAMDGPUFlatWorkGroupSizeAttr(const FunctionDecl *fd, cir::FuncOp func,
+                                  CIRGenModule &cgm, CIRGenBuilderTy &builder,
+                                  bool isOpenCLKernel, bool isHIPKernel) {
   const auto *flatWGS = fd->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
   const auto *reqdWGS =
       cgm.getLangOpts().OpenCL ? fd->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
@@ -93,13 +92,12 @@ static void handleAMDGPUFlatWorkGroupSizeAttr(const clang::FunctionDecl *fd,
 }
 
 /// Handle amdgpu-waves-per-eu attribute.
-static void handleAMDGPUWavesPerEUAttr(const clang::FunctionDecl *fd,
-                                       cir::FuncOp func, CIRGenModule &cgm) {
+static void handleAMDGPUWavesPerEUAttr(const FunctionDecl *fd, cir::FuncOp func,
+                                       CIRGenModule &cgm,
+                                       CIRGenBuilderTy &builder) {
   const auto *attr = fd->getAttr<AMDGPUWavesPerEUAttr>();
   if (!attr)
     return;
-
-  auto &builder = cgm.getBuilder();
   unsigned min =
       attr->getMin()->EvaluateKnownConstInt(cgm.getASTContext()).getExtValue();
   unsigned max = attr->getMax()
@@ -120,44 +118,43 @@ static void handleAMDGPUWavesPerEUAttr(const clang::FunctionDecl *fd,
 }
 
 /// Handle amdgpu-num-sgpr attribute.
-static void handleAMDGPUNumSGPRAttr(const clang::FunctionDecl *fd,
-                                    cir::FuncOp func, CIRGenModule &cgm) {
+static void handleAMDGPUNumSGPRAttr(const FunctionDecl *fd, cir::FuncOp func,
+                                    CIRGenModule &cgm,
+                                    CIRGenBuilderTy &builder) {
   const auto *attr = fd->getAttr<AMDGPUNumSGPRAttr>();
   if (!attr)
     return;
 
   uint32_t numSGPR = attr->getNumSGPR();
   if (numSGPR != 0) {
-    auto &builder = cgm.getBuilder();
     func->setAttr("cir.amdgpu-num-sgpr",
                   builder.getStringAttr(llvm::utostr(numSGPR)));
   }
 }
 
 /// Handle amdgpu-num-vgpr attribute.
-static void handleAMDGPUNumVGPRAttr(const clang::FunctionDecl *fd,
-                                    cir::FuncOp func, CIRGenModule &cgm) {
+static void handleAMDGPUNumVGPRAttr(const FunctionDecl *fd, cir::FuncOp func,
+                                    CIRGenModule &cgm,
+                                    CIRGenBuilderTy &builder) {
   const auto *attr = fd->getAttr<AMDGPUNumVGPRAttr>();
   if (!attr)
     return;
 
   uint32_t numVGPR = attr->getNumVGPR();
   if (numVGPR != 0) {
-    auto &builder = cgm.getBuilder();
     func->setAttr("cir.amdgpu-num-vgpr",
                   builder.getStringAttr(llvm::utostr(numVGPR)));
   }
 }
 
 /// Handle amdgpu-max-num-workgroups attribute.
-static void handleAMDGPUMaxNumWorkGroupsAttr(const clang::FunctionDecl *fd,
+static void handleAMDGPUMaxNumWorkGroupsAttr(const FunctionDecl *fd,
                                              cir::FuncOp func,
-                                             CIRGenModule &cgm) {
+                                             CIRGenModule &cgm,
+                                             CIRGenBuilderTy &builder) {
   const auto *attr = fd->getAttr<AMDGPUMaxNumWorkGroupsAttr>();
   if (!attr)
     return;
-
-  auto &builder = cgm.getBuilder();
   uint32_t x = attr->getMaxNumWorkGroupsX()
                    ->EvaluateKnownConstInt(cgm.getASTContext())
                    .getExtValue();
@@ -180,10 +177,10 @@ static void handleAMDGPUMaxNumWorkGroupsAttr(const clang::FunctionDecl *fd,
 }
 
 /// Handle amdgpu-cluster-dims attribute.
-static void handleAMDGPUClusterDimsAttr(const clang::FunctionDecl *fd,
+static void handleAMDGPUClusterDimsAttr(const FunctionDecl *fd,
                                         cir::FuncOp func, CIRGenModule &cgm,
+                                        CIRGenBuilderTy &builder,
                                         bool isOpenCLKernel) {
-  auto &builder = cgm.getBuilder();
 
   if (const auto *attr = fd->getAttr<CUDAClusterDimsAttr>()) {
     auto getExprVal = [&](const Expr *e) {
@@ -201,7 +198,7 @@ static void handleAMDGPUClusterDimsAttr(const clang::FunctionDecl *fd,
                   builder.getStringAttr(attrVal.str()));
   }
 
-  const clang::TargetInfo &targetInfo = cgm.getASTContext().getTargetInfo();
+  const TargetInfo &targetInfo = cgm.getASTContext().getTargetInfo();
   if ((isOpenCLKernel &&
        targetInfo.hasFeatureEnabled(targetInfo.getTargetOpts().FeatureMap,
                                     "clusters")) ||
@@ -211,58 +208,48 @@ static void handleAMDGPUClusterDimsAttr(const clang::FunctionDecl *fd,
 }
 
 /// Handle amdgpu-ieee attribute.
-static void handleAMDGPUIEEEAttr(cir::FuncOp func, CIRGenModule &cgm) {
-  if (!cgm.getCodeGenOpts().EmitIEEENaNCompliantInsts) {
-    auto &builder = cgm.getBuilder();
+static void handleAMDGPUIEEEAttr(cir::FuncOp func, CIRGenModule &cgm,
+                                 CIRGenBuilderTy &builder) {
+  if (!cgm.getCodeGenOpts().EmitIEEENaNCompliantInsts)
     func->setAttr("cir.amdgpu-ieee", builder.getStringAttr("false"));
-  }
 }
 
 /// Handle amdgpu-expand-waitcnt-profiling attribute.
 static void handleAMDGPUExpandWaitcntProfilingAttr(cir::FuncOp func,
-                                                   CIRGenModule &cgm) {
-  if (cgm.getCodeGenOpts().AMDGPUExpandWaitcntProfiling) {
-    auto &builder = cgm.getBuilder();
+                                                   CIRGenModule &cgm,
+                                                   CIRGenBuilderTy &builder) {
+  if (cgm.getCodeGenOpts().AMDGPUExpandWaitcntProfiling)
     func->setAttr("cir.amdgpu-expand-waitcnt-profiling",
                   builder.getStringAttr(""));
-  }
 }
 
-} // anonymous namespace
+} // namespace
 
-void clang::CIRGen::setAMDGPUTargetFunctionAttributes(const clang::Decl *decl,
+void clang::CIRGen::setAMDGPUTargetFunctionAttributes(const Decl *decl,
                                                       cir::FuncOp func,
                                                       CIRGenModule &cgm) {
-  if (requiresAMDGPUProtectedVisibility(decl, func)) {
-    func.setGlobalVisibility(cir::VisibilityKind::Protected);
-    func.setDSOLocal(true);
-  }
-
   if (func.isDeclaration())
     return;
 
-  const auto *fd = clang::dyn_cast_or_null<clang::FunctionDecl>(decl);
+  CIRGenBuilderTy &builder = cgm.getBuilder();
+
+  const auto *fd = dyn_cast_or_null<FunctionDecl>(decl);
   if (fd) {
     const bool isOpenCLKernel =
         cgm.getLangOpts().OpenCL && fd->hasAttr<DeviceKernelAttr>();
     const bool isHIPKernel =
         cgm.getLangOpts().HIP && fd->hasAttr<CUDAGlobalAttr>();
 
-    if (isHIPKernel) {
-      // TODO(CIR) : Add amdgpu calling conv.
-      func.setVisibility(mlir::SymbolTable::Visibility::Public);
-      func.setLinkageAttr(cir::GlobalLinkageKindAttr::get(
-          func.getContext(), cir::GlobalLinkageKind::ExternalLinkage));
-    }
+    // TODO(CIR): Set amdgpu_kernel calling convention for HIP kernels.
 
-    handleAMDGPUFlatWorkGroupSizeAttr(fd, func, cgm, isOpenCLKernel,
+    handleAMDGPUFlatWorkGroupSizeAttr(fd, func, cgm, builder, isOpenCLKernel,
                                       isHIPKernel);
-    handleAMDGPUWavesPerEUAttr(fd, func, cgm);
-    handleAMDGPUNumSGPRAttr(fd, func, cgm);
-    handleAMDGPUNumVGPRAttr(fd, func, cgm);
-    handleAMDGPUMaxNumWorkGroupsAttr(fd, func, cgm);
-    handleAMDGPUClusterDimsAttr(fd, func, cgm, isOpenCLKernel);
+    handleAMDGPUWavesPerEUAttr(fd, func, cgm, builder);
+    handleAMDGPUNumSGPRAttr(fd, func, cgm, builder);
+    handleAMDGPUNumVGPRAttr(fd, func, cgm, builder);
+    handleAMDGPUMaxNumWorkGroupsAttr(fd, func, cgm, builder);
+    handleAMDGPUClusterDimsAttr(fd, func, cgm, builder, isOpenCLKernel);
   }
-  handleAMDGPUIEEEAttr(func, cgm);
-  handleAMDGPUExpandWaitcntProfilingAttr(func, cgm);
+  handleAMDGPUIEEEAttr(func, cgm, builder);
+  handleAMDGPUExpandWaitcntProfilingAttr(func, cgm, builder);
 }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to