https://github.com/lalaniket8 created https://github.com/llvm/llvm-project/pull/135138
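A rough usage sketch, inferred from the lowering in the patch below rather than taken from it: each lane passes a value plus a divergent mask, lanes whose masks share the same lowest set bit are reduced together (that bit picks the group's leader lane), and every lane reads back its group's maximum. The HIP kernel, its names, and the one-wave-per-block setup are hypothetical; only the builtin's name and argument order come from the patch, and strategy 0 is assumed to select the default lowering.

    // Hypothetical HIP kernel; illustrative only, not part of the patch.
    #include <hip/hip_runtime.h>

    __global__ void groupwise_max(const int *in, const int *masks, int *out) {
      int lane = threadIdx.x;  // assumes a single wave per block
      int val = in[lane];      // per-lane value to reduce
      int mask = masks[lane];  // lanes whose masks share a lowest set bit
                               // form one reduction group
      // The third operand is an immediate strategy selector (assumed 0: default).
      out[lane] = __builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32(
          val, mask, 0);
    }

At the IR level the builtin should emit the new overloaded intrinsic, roughly
(the exact overload suffix is an assumption):

    %r = call i32 @llvm.amdgcn.wave.reduce.wrt.divergent.mask.umax.i32(
             i32 %val, i32 %mask, i32 0)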
>From 810ad9859bbcd66d6942e497c25bdde27978bf3c Mon Sep 17 00:00:00 2001
From: anikelal <anike...@amd.com>
Date: Wed, 9 Apr 2025 11:24:21 +0530
Subject: [PATCH] reduce max wrt divergent mask

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def |   2 +
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp  |  18 +++
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td     |  14 ++
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    | 157 +++++++++++++++++++
 llvm/lib/Target/AMDGPU/SIInstructions.td     |  12 ++
 5 files changed, 203 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index cbef637be213a..642c25f6a0bff 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -366,6 +366,8 @@ BUILTIN(__builtin_r600_read_tidig_z, "Ui", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieee, "dd", "nc")
 BUILTIN(__builtin_r600_recipsqrt_ieeef, "ff", "nc")
 
+BUILTIN(__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32, "iiii", "nc")
+
 //===----------------------------------------------------------------------===//
 // MFMA builtins.
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..15c0eeab4a78c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -284,6 +284,15 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
   Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
 }
 
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+  switch (BuiltinID) {
+  case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32:
+    return Intrinsic::amdgcn_wave_reduce_wrt_divergent_mask_umax;
+  default:
+    llvm_unreachable("Unknown BuiltinID for wave reduction");
+  }
+}
+
 Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
                                               const CallExpr *E) {
   llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
@@ -1142,6 +1151,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_s_prefetch_data:
     return emitBuiltinWithOneOverloadedType<2>(
         *this, E, Intrinsic::amdgcn_s_prefetch_data);
+  case AMDGPU::BI__builtin_amdgcn_wave_reduce_wrt_divergent_mask_max_i32: {
+    Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+    llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Mask = EmitScalarExpr(E->getArg(1));
+    llvm::Value *Strategy = EmitScalarExpr(E->getArg(2));
+    llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+    return Builder.CreateCall(F, {Value, Mask, Strategy});
+  }
   default:
     return nullptr;
   }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index ae2f6e62c0272..b0a4ab04f4ca8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2349,6 +2349,20 @@ class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
 def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 
+class AMDGPUWaveReduceWrtDivergentMask<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
+    [data_ty],
+    [
+      LLVMMatchType<0>, // LLVM value to reduce (SGPR/VGPR).
+      llvm_i32_ty,      // Divergent mask.
+      llvm_i32_ty       // Reduction strategy switch for lowering
+                        // (0: Default, 1: Iterative, 2: DPP).
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
+     ImmArg<ArgIndex<2>>]>;
+
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umin : AMDGPUWaveReduceWrtDivergentMask;
+def int_amdgcn_wave_reduce_wrt_divergent_mask_umax : AMDGPUWaveReduceWrtDivergentMask;
+
 def int_amdgcn_readfirstlane :
   Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 356040da95672..83ccba7cf1481 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5143,6 +5143,159 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   return RetBB;
 }
 
+static MachineBasicBlock *lowerWaveReduceWrtDivergentMask(MachineInstr &MI,
+                                                          MachineBasicBlock &BB,
+                                                          const GCNSubtarget &ST,
+                                                          unsigned Opc) {
+  MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Reduction operations depend on whether the input operand is SGPR or VGPR.
+  Register SrcReg = MI.getOperand(1).getReg();
+  auto SrcRegClass = MRI.getRegClass(SrcReg);
+  bool isSGPR = TRI->isSGPRClass(SrcRegClass);
+  Register DstReg = MI.getOperand(0).getReg();
+  Register DivergentMaskReg = MI.getOperand(2).getReg();
+
+  MachineBasicBlock *RetBB = nullptr;
+  if (isSGPR) {
+    // A uniform input is already its own reduction.
+    BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+    RetBB = &BB;
+  } else {
+    MachineBasicBlock::iterator I = BB.end();
+
+    auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+    auto SReg32XM0RegClass = &AMDGPU::SReg_32_XM0RegClass;
+    auto SReg32RegClass = &AMDGPU::SReg_32RegClass;
+
+    const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+    const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+    Register ExecCopyReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register AccSGPRReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register UpdatedAccSGPRReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register AccReg1 = MRI.createVirtualRegister(DstRegClass);
+    Register AccReg = MRI.createVirtualRegister(DstRegClass);
+    Register BPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register InitialBPermAddrReg = MRI.createVirtualRegister(DstRegClass);
+    Register UpdatedAccReg = MRI.createVirtualRegister(DstRegClass);
+    Register ActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register UpdatedActiveLanesReg = MRI.createVirtualRegister(WaveMaskRegClass);
+    Register FF1ActiveLanesReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskReg = MRI.createVirtualRegister(SReg32RegClass);
+    Register FF1MaskX4Reg = MRI.createVirtualRegister(SReg32RegClass);
+    Register ValReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+    Register MaskReg = MRI.createVirtualRegister(SReg32XM0RegClass);
+
+    bool IsWave32 = ST.isWave32();
+
+    // Identity for the reduction: UINT_MAX for umin, 0 for umax.
+    uint32_t IdentityValue =
+        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+
+    BuildMI(BB, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
+            ExecCopyReg)
+        .addReg(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), AccReg)
+        .addImm(IdentityValue);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), InitialBPermAddrReg)
+        .addImm(0);
+    BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+    I = ComputeLoop->end();
+
+    auto PhiActiveLanesInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveLanesReg)
+            .addReg(ExecCopyReg)
+            .addMBB(&BB);
+    auto PhiAccInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccReg1)
+            .addReg(AccReg)
+            .addMBB(&BB);
+    auto PhiBPermAddrInst =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), BPermAddrReg)
+            .addReg(InitialBPermAddrReg)
+            .addMBB(&BB);
+
+    // Pick the lowest still-active lane and read its value and mask.
+    BuildMI(*ComputeLoop, I, DL,
+            TII->get(IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64),
+            FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), ValReg)
+        .addReg(SrcReg)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), MaskReg)
+        .addReg(DivergentMaskReg)
+        .addReg(FF1ActiveLanesReg);
+
+    // The lowest set bit of the lane's mask names its group's leader lane.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B32), FF1MaskReg)
+        .addReg(MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32), AccSGPRReg)
+        .addReg(AccReg1)
+        .addReg(FF1MaskReg);
+
+    // Fold the lane's value into its group's accumulator.
+    BuildMI(*ComputeLoop, I, DL, TII->get(Opc), UpdatedAccSGPRReg)
+        .addReg(AccSGPRReg)
+        .addReg(ValReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1MaskReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedAccReg)
+        .addReg(UpdatedAccSGPRReg)
+        .addReg(AMDGPU::M0)
+        .addReg(AccReg1);
+
+    // Byte address (leader lane * 4) used below by DS_BPERMUTE_B32.
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_LSHL_B32), FF1MaskX4Reg)
+        .addReg(FF1MaskReg)
+        .addImm(2);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(FF1ActiveLanesReg);
+
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_WRITELANE_B32),
+            UpdatedBPermAddrReg)
+        .addReg(FF1MaskX4Reg)
+        .addReg(AMDGPU::M0)
+        .addReg(BPermAddrReg);
+
+    // Retire the processed lane from the active-lane mask.
+    unsigned BITSETOpc =
+        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), UpdatedActiveLanesReg)
+        .addReg(FF1ActiveLanesReg)
+        .addReg(ActiveLanesReg);
+
+    PhiActiveLanesInst.addReg(UpdatedActiveLanesReg).addMBB(ComputeLoop);
+    PhiAccInst.addReg(UpdatedAccReg).addMBB(ComputeLoop);
+    PhiBPermAddrInst.addReg(UpdatedBPermAddrReg).addMBB(ComputeLoop);
+
+    // Loop while any lane remains active.
+    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+    BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+        .addReg(UpdatedActiveLanesReg)
+        .addImm(0);
+    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+        .addMBB(ComputeLoop);
+
+    // Broadcast each group's accumulated result back to all of its lanes.
+    BuildMI(*ComputeEnd, ComputeEnd->begin(), DL,
+            TII->get(AMDGPU::DS_BPERMUTE_B32), DstReg)
+        .addReg(UpdatedBPermAddrReg)
+        .addReg(UpdatedAccReg)
+        .addImm(0);
+
+    RetBB = ComputeEnd;
+  }
+  MI.eraseFromParent();
+  return RetBB;
+}
+
 MachineBasicBlock *
 SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
@@ -5156,6 +5309,10 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
   case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
     return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32:
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+  case AMDGPU::WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32:
+    return lowerWaveReduceWrtDivergentMask(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
   case AMDGPU::S_UADDO_PSEUDO:
   case AMDGPU::S_USUBO_PSEUDO: {
     const DebugLoc &DL = MI.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 9051db0c01ed1..a2ddfdf5be125 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -315,6 +315,18 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
   }
 }
 
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0,
+    Uses = [EXEC] in {
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMIN_PSEUDO_U32 : VPseudoInstSI <
+    (outs VGPR_32:$vdst), (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umin i32:$src, i32:$mask, i32:$strategy))]>;
+
+  def WAVE_REDUCE_WRT_DIVERGENT_MASK_UMAX_PSEUDO_U32 : VPseudoInstSI <
+    (outs VGPR_32:$vdst), (ins VSrc_b32:$src, VSrc_b32:$mask, VSrc_b32:$strategy),
+    [(set i32:$vdst, (int_amdgcn_wave_reduce_wrt_divergent_mask_umax i32:$src, i32:$mask, i32:$strategy))]>;
+}
+
 let usesCustomInserter = 1, Defs = [VCC] in {
   def V_ADD_U64_PSEUDO : VPseudoInstSI <
     (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits