https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/113019
>From 63a199325b085599e1d66c241c7a9beca667dfb3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Sat, 19 Oct 2024 02:18:45 +0400 Subject: [PATCH] AMDGPU: Mark grid size loads with range metadata Only handles the v5 case. --- .../AMDGPU/AMDGPULowerKernelAttributes.cpp | 33 ++++- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 + ...amdgpu-max-num-workgroups-load-annotate.ll | 124 ++++++++++++++++++ 3 files changed, 154 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 1bb5e794da7dd6..5fc0c36359b6f5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" @@ -83,6 +84,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { } // end anonymous namespace +static void annotateGridSizeLoadWithRangeMD(LoadInst *Load, + uint32_t MaxNumGroups) { + if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max()) + return; + + if (!Load->getType()->isIntegerTy(32)) + return; + + // TODO: If there is existing range metadata, preserve it if it is stricter. + MDBuilder MDB(Load->getContext()); + MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1)); + Load->setMetadata(LLVMContext::MD_range, Range); +} + static bool processUse(CallInst *CI, bool IsV5OrAbove) { Function *F = CI->getParent()->getParent(); @@ -92,7 +107,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { const bool HasUniformWorkGroupSize = F->getFnAttribute("uniform-work-group-size").getValueAsBool(); - if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize) + SmallVector<unsigned> MaxNumWorkgroups = + AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3); + + if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize && + none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; })) return false; Value *BlockCounts[3] = {nullptr, nullptr, nullptr}; @@ -133,16 +152,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { if (IsV5OrAbove) { // Base is ImplicitArgPtr. switch (Offset) { case HIDDEN_BLOCK_COUNT_X: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[0] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]); + } break; case HIDDEN_BLOCK_COUNT_Y: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[1] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]); + } break; case HIDDEN_BLOCK_COUNT_Z: - if (LoadSize == 4) + if (LoadSize == 4) { BlockCounts[2] = Load; + annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]); + } break; case HIDDEN_GROUP_SIZE_X: if (LoadSize == 2) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 54b17ca2cffb15..b18ce90cf45dba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -369,6 +369,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct TM.getSubtarget<R600Subtarget>(F)); } +// FIXME: This has no reason to be in subtarget SmallVector<unsigned> AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3, diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll new file mode 100644 index 00000000000000..9064292129928f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s + +define i32 @use_grid_size_x_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_y_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4 +; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4 + %grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4 + ret i32 %grid.size.y +} + +define i32 @use_grid_size_z_max_num_workgroups() #0 { +; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8 +; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8 + %grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4 + ret i32 %grid.size.z +} + +define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 { +; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4 + ret <2 x i16> %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]] +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_max() #2 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +define i32 @use_grid_size_x_max_num_workgroups_zero() #3 { +; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero( +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { +; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4 +; CHECK-NEXT: ret i32 [[GRID_SIZE_X]] +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4 + ret i32 %grid.size.x +} + +declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3 + +attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" } +attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" } +attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" } + +!0 = !{i32 0, i32 -1} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; CHECK: [[RNG0]] = !{i32 1, i32 37} +; CHECK: [[RNG1]] = !{i32 1, i32 43} +; CHECK: [[RNG2]] = !{i32 1, i32 90} +; CHECK: [[RNG3]] = !{i32 1, i32 -1} +;. _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits