[clang] f5b2168 - [AMDGPU] Add amdgcn_sched_group_barrier builtin
Author: Austin Kerbow Date: 2022-07-28T10:43:14-07:00 New Revision: f5b21680d1221d7acaa1b174d0b86fa907c71eb8 URL: https://github.com/llvm/llvm-project/commit/f5b21680d1221d7acaa1b174d0b86fa907c71eb8 DIFF: https://github.com/llvm/llvm-project/commit/f5b21680d1221d7acaa1b174d0b86fa907c71eb8.diff LOG: [AMDGPU] Add amdgcn_sched_group_barrier builtin This builtin allows the creation of custom scheduling pipelines on a per-region basis. Like the sched_barrier builtin this is intended to be used either for testing, in situations where the default scheduler heuristics cannot be improved, or in critical kernels where users are trying to get performance that is close to handwritten assembly. Obviously using these builtins will require extra work from the kernel writer to maintain the desired behavior. The builtin can be used to create groups of instructions called "scheduling groups" where ordering between the groups is enforced by the scheduler. __builtin_amdgcn_sched_group_barrier takes three parameters. The first parameter is a mask that determines the types of instructions that you would like to synchronize around and add to a scheduling group. These instructions will be selected from the bottom up starting from the sched_group_barrier's location during instruction scheduling. The second parameter is the number of matching instructions that will be associated with this sched_group_barrier. The third parameter is an identifier which is used to describe what other sched_group_barriers should be synchronized with. Note that multiple sched_group_barriers must be added in order for them to be useful since they only synchronize with other sched_group_barriers. Only "scheduling groups" with a matching third parameter will have any enforced ordering between them. As an example, the code below tries to create a pipeline of 1 VMEM_READ instruction followed by 1 VALU instruction followed by 5 MFMA instructions... 
// 1 VMEM_READ __builtin_amdgcn_sched_group_barrier(32, 1, 0) // 1 VALU __builtin_amdgcn_sched_group_barrier(2, 1, 0) // 5 MFMA __builtin_amdgcn_sched_group_barrier(8, 5, 0) // 1 VMEM_READ __builtin_amdgcn_sched_group_barrier(32, 1, 0) // 3 VALU __builtin_amdgcn_sched_group_barrier(2, 3, 0) // 2 VMEM_WRITE __builtin_amdgcn_sched_group_barrier(64, 2, 0) Reviewed By: jrbyrnes Differential Revision: https://reviews.llvm.org/D128158 Added: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll llvm/test/CodeGen/AMDGPU/sched-group-barrier-pre-RA.mir Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp Removed: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index cdf5f5a854189..618d5562e5093 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") +BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 9853045ea19f9..444b65a83719b 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -409,6 +409,19 @@ void test_sched_barrier() __builtin_amdgcn_sched_barrier(15); } +// CHECK-LABEL: @test_sched_group_barrier +// CHECK: 
call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 1, i32 2) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 2, i32 4) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 8, i32 16) +// CHECK: call void @llvm.amdgcn.sched.group.barrier(i32 15, i32 1, i32 -1) +void test_sched_group_barrier() +{ + __builtin_amdgcn_sched_group_barrier(0, 1, 2); + __builtin_amdgcn_sched_group_barrier(1, 2, 4); + __builtin_amdgcn_sched_group_barrier(4, 8, 16); + __builtin_amdgcn_sched_group_barrier(15, 1, -1); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl index 32fb41a6201c4..dd296e3854973 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/b
[clang] 62bcfcb - [AMDGPU] Add llvm.amdgcn.s.setprio intrinsic
Author: Austin Kerbow Date: 2022-03-12T22:15:42-08:00 New Revision: 62bcfcb5a588e5e844f8e4e42a2e4d15c907a746 URL: https://github.com/llvm/llvm-project/commit/62bcfcb5a588e5e844f8e4e42a2e4d15c907a746 DIFF: https://github.com/llvm/llvm-project/commit/62bcfcb5a588e5e844f8e4e42a2e4d15c907a746.diff LOG: [AMDGPU] Add llvm.amdgcn.s.setprio intrinsic Reviewed By: rampitec, arsenm Differential Revision: https://reviews.llvm.org/D120976 Added: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/SOPInstructions.td Removed: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3b7ff75a9410a..d2e60f85b9feb 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -116,6 +116,7 @@ BUILTIN(__builtin_amdgcn_cubema, "", "nc") BUILTIN(__builtin_amdgcn_s_sleep, "vIi", "n") BUILTIN(__builtin_amdgcn_s_incperflevel, "vIi", "n") BUILTIN(__builtin_amdgcn_s_decperflevel, "vIi", "n") +BUILTIN(__builtin_amdgcn_s_setprio, "vIs", "n") BUILTIN(__builtin_amdgcn_uicmp, "WUiUiUiIi", "nc") BUILTIN(__builtin_amdgcn_uicmpl, "WUiWUiWUiIi", "nc") BUILTIN(__builtin_amdgcn_sicmp, "WUiiiIi", "nc") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index b02e6308c343f..ee6d2e0589c5c 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -423,6 +423,15 @@ void test_s_decperflevel() __builtin_amdgcn_s_decperflevel(15); } +// CHECK-LABEL: @test_s_setprio +// CHECK: call void @llvm.amdgcn.s.setprio(i16 0) +// CHECK: call void @llvm.amdgcn.s.setprio(i16 3) +void test_s_setprio() +{ + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_s_setprio(3); +} + // CHECK-LABEL: @test_cubeid( // CHECK: 
call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) void test_cubeid(global float* out, float a, float b, float c) { diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl index 20a520812e3f3..af4fdf32c272f 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -54,6 +54,12 @@ void test_s_decperflevel(int x) __builtin_amdgcn_s_decperflevel(x); // expected-error {{argument to '__builtin_amdgcn_s_decperflevel' must be a constant integer}} } +void test_s_setprio(int x) +{ + __builtin_amdgcn_s_setprio(x); // expected-error {{argument to '__builtin_amdgcn_s_setprio' must be a constant integer}} + __builtin_amdgcn_s_setprio(65536); // expected-warning {{implicit conversion from 'int' to 'short' changes value from 65536 to 0}} +} + void test_sicmp_i32(global ulong* out, int a, int b, uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2a351721e034a..0de25e20e7d0a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1329,6 +1329,11 @@ def int_amdgcn_s_sethalt : Intrinsic<[], [llvm_i32_ty], [ImmArg>, IntrNoMem, IntrHasSideEffects, IntrWillReturn]>; +def int_amdgcn_s_setprio : + GCCBuiltin<"__builtin_amdgcn_s_setprio">, + Intrinsic<[], [llvm_i16_ty], [ImmArg>, IntrNoMem, +IntrHasSideEffects, IntrWillReturn]>; + def int_amdgcn_s_getreg : GCCBuiltin<"__builtin_amdgcn_s_getreg">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 86d1feb183f44..16f747a285d6d 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1278,7 +1278,10 @@ def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins 
i32imm:$simm16), let hasSideEffects = 1; } -def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; +def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16", + [(int_amdgcn_s_setprio timm:$simm16)]> { + let hasSideEffects = 1; +} let Uses = [EXEC, M0] in { // FIXME: Should this be mayLoad+mayStore? diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll new file mode 100644 index 0..8dd414f78340b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.setprio.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx90a -show-mc-encoding -ve
[clang] 2db7002 - [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic
Author: Austin Kerbow Date: 2022-05-11T13:22:51-07:00 New Revision: 2db700215a2eebce7358c0a81a3d52d0a9d4a997 URL: https://github.com/llvm/llvm-project/commit/2db700215a2eebce7358c0a81a3d52d0a9d4a997 DIFF: https://github.com/llvm/llvm-project/commit/2db700215a2eebce7358c0a81a3d52d0a9d4a997.diff LOG: [AMDGPU] Add llvm.amdgcn.sched.barrier intrinsic Adds an intrinsic/builtin that can be used to fine-tune scheduler behavior. If there is a need to have highly optimized codegen and kernel developers have knowledge of inter-wave runtime behavior which is unknown to the compiler, this builtin can be used to tune scheduling. This intrinsic creates a barrier between scheduling regions. The immediate parameter is a mask to determine the types of instructions that are allowed to cross the sched_barrier. In this initial patch, there are only two variations. A mask of 0 means that no instructions may be scheduled across the sched_barrier. A mask of 1 means that non-memory, non-side-effect inducing instructions may cross the sched_barrier. Note that this intrinsic is only meant to work with the scheduling passes. Any other transformations that may move code will not be impacted in the ways described above. 
Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D124700 Added: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.barrier.ll llvm/test/CodeGen/AMDGPU/sched_barrier.mir Modified: clang/include/clang/Basic/BuiltinsAMDGPU.def clang/test/CodeGenOpenCL/builtins-amdgcn.cl clang/test/SemaOpenCL/builtins-amdgcn-error.cl llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp llvm/lib/Target/AMDGPU/SIInstrInfo.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp llvm/test/CodeGen/AMDGPU/hazard-pseudo-machineinstrs.mir Removed: diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index afcfa07f6df13..19e4ea998aa47 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -62,6 +62,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsg, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n") BUILTIN(__builtin_amdgcn_s_barrier, "v", "n") BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n") +BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n") BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n") BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n") BUILTIN(__builtin_amdgcn_ds_gws_init, "vUiUi", "n") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 7d1509e4c4cb0..9853045ea19f9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -396,6 +396,19 @@ void test_wave_barrier() __builtin_amdgcn_wave_barrier(); } +// CHECK-LABEL: @test_sched_barrier +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 0) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 1) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 4) +// CHECK: call void @llvm.amdgcn.sched.barrier(i32 15) +void test_sched_barrier() +{ + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_sched_barrier(1); + 
__builtin_amdgcn_sched_barrier(4); + __builtin_amdgcn_sched_barrier(15); +} + // CHECK-LABEL: @test_s_sleep // CHECK: call void @llvm.amdgcn.s.sleep(i32 1) // CHECK: call void @llvm.amdgcn.s.sleep(i32 15) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl index af4fdf32c272f..1351ab58c1f9a 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error.cl @@ -60,6 +60,11 @@ void test_s_setprio(int x) __builtin_amdgcn_s_setprio(65536); // expected-warning {{implicit conversion from 'int' to 'short' changes value from 65536 to 0}} } +void test_sched_barrier(int x) +{ + __builtin_amdgcn_sched_barrier(x); // expected-error {{argument to '__builtin_amdgcn_sched_barrier' must be a constant integer}} +} + void test_sicmp_i32(global ulong* out, int a, int b, uint c) { *out = __builtin_amdgcn_sicmp(a, b, c); // expected-error {{argument to '__builtin_amdgcn_sicmp' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index f148fda5b37f5..df552982a4ee9 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -213,6 +213,15 @@ def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">, def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">, Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn]>; +// The 1st parameter is a mask for the types of instructions that may be allowed +// to cross the SCHED_BARRIER during
[clang] 864a2b2 - [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting
Author: Austin Kerbow Date: 2023-03-17T20:26:23-07:00 New Revision: 864a2b25beac507cc76b50030757283aae434c0c URL: https://github.com/llvm/llvm-project/commit/864a2b25beac507cc76b50030757283aae434c0c DIFF: https://github.com/llvm/llvm-project/commit/864a2b25beac507cc76b50030757283aae434c0c.diff LOG: [AMDGPU] Reserve extra SGPR blocks with XNACK "any" TID Setting ASMPrinter was relying on feature bits to set up extra SGPRs in the kernel descriptor for the xnack_mask. This was broken for the dynamic XNACK "any" TID setting which could cause user SGPRs to be clobbered if the number of SGPRs reserved was near a granulated block boundary. When XNACK was enabled this worked correctly in the ASMParser which meant some kernels were only failing without "-save-temps". Fixes: SWDEV-382764 Reviewed By: kzhuravl Differential Revision: https://reviews.llvm.org/D145401 Added: llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll Modified: clang/test/Frontend/amdgcn-machine-analysis-remarks.cl llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll llvm/test/CodeGen/AMDGPU/amdpal-callable.ll llvm/test/CodeGen/AMDGPU/occupancy-levels.ll llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll llvm/test/CodeGen/AMDGPU/trap-abis.ll Removed: diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl index cf0c15b6319f1..9403d12afa05a 100644 --- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl +++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null // expected-remark@+9 {{Function Name: foo}} -// expected-remark@+8 {{SGPRs: 9}} +// expected-remark@+8 {{SGPRs: 13}} // expected-remark@+7 {{VGPRs: 10}} // expected-remark@+6 {{AGPRs: 12}} // 
expected-remark@+5 {{ScratchSize [bytes/lane]: 0}} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0883e7a5ed3a3..82c57dfcef0d8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -251,9 +251,9 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo), CurrentProgramInfo.NumVGPRsForWavesPerEU, CurrentProgramInfo.NumSGPRsForWavesPerEU - - IsaInfo::getNumExtraSGPRs(&STM, -CurrentProgramInfo.VCCUsed, -CurrentProgramInfo.FlatUsed), + IsaInfo::getNumExtraSGPRs( + &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()), CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, CodeObjectVersion); @@ -721,7 +721,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be // unified. unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs( - &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed); + &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed, + getTargetStreamer()->getTargetID()->isXnackOnOrAny()); // Check the addressable register limit before we add ExtraSGPRs. 
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 8e558b539fa72..e639fce9d690e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3061,7 +3061,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: gds_segment_byte_size = 0 ; GPRIDX-NEXT: kernarg_segment_byte_size = 12 ; GPRIDX-NEXT: workgroup_fbarrier_count = 0 -; GPRIDX-NEXT: wavefront_sgpr_count = 9 +; GPRIDX-NEXT: wavefront_sgpr_count = 13 ; GPRIDX-NEXT: workitem_vgpr_count = 3 ; GPRIDX-NEXT: reserved_vgpr_first = 0 ; GPRIDX-NEXT: reserved_vgpr_count = 0 @@ -3913,7 +3913,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256 ; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0 ; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0 -; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 0 +; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1 ; GPRIDX-NEXT: priority = 0 ; GPRIDX-NEXT: float_mode = 240 ; GPRIDX-NEXT: priv = 0 @@ -3956,7 +3956,7 @@ define amdgpu_kernel void @dyn_extract_v