llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) <details> <summary>Changes</summary> This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. --- Full diff: https://github.com/llvm/llvm-project/pull/143673.diff 3 Files Affected: - (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+3-1) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+116-35) - (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll (+48-103) ``````````diff diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 526a395181764..cc24585b4e4ad 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6765,7 +6765,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: + break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 0708c76bcb3fc..6149d7ca84354 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include <optional> using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14320,7 +14322,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14353,7 +14355,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14465,8 +14467,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -14940,44 +14945,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). - // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - bool N1OneUse = N1.hasOneUse(); - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if ((ZIsConstant != YIsConstant) && N1OneUse) { - SDNodeFlags Flags; - // If both additions in the original were NUW, the new ones are as well. - if (N->getFlags().hasNoUnsignedWrap() && - N1->getFlags().hasNoUnsignedWrap()) - Flags |= SDNodeFlags::NoUnsignedWrap; - - if (YIsConstant) - std::swap(Y, Z); - - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); - DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. + if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } + + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) { + SDNodeFlags Flags; + // If both additions in the original were NUW, reassociation preserves + // that. + if (N->getFlags().hasNoUnsignedWrap() && + N0->getFlags().hasNoUnsignedWrap()) + Flags |= SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); + } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. + // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + SDNodeFlags ReassocFlags; + // If both additions in the original were NUW, reassociation preserves that. + if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap()) + ReassocFlags |= SDNodeFlags::NoUnsignedWrap; + if (ZIsConstant != YIsConstant) { + + if (YIsConstant) + std::swap(Y, Z); + + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. + if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 1ec94162951a6..c00bccdbce6b7 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -145,49 +145,29 @@ entry: ; Test skipping the lower-32-bit addition if it is unnecessary. define ptr @huge_offset_low_32_unused(ptr %p) { -; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0 -; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: huge_offset_low_32_unused: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000 ret ptr %gep } ; Reassociate address computation if it leads to more scalar operations. define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: reassoc_scalar_r: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_r: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_r: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -198,30 +178,18 @@ entry: } define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: reassoc_scalar_l: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_l: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_l: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -233,24 +201,14 @@ entry: ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) { -; GFX942_PTRADD-LABEL: shl_neg_offset: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2 -; GFX942_PTRADD-NEXT: s_nop 1 -; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: shl_neg_offset: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 -; GFX942_LEGACY-NEXT: s_nop 1 -; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: shl_neg_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] %offset = sub i64 0, %noffset %x = shl i64 %offset, %shift %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x @@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; GFX942_PTRADD: ; %bb.0: ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12 +; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10 ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] ; ; GFX942_LEGACY-LABEL: complextype_global_gep: @@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; Tests the tryFoldToMad64_32 PTRADD combine. define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { -; GFX942_PTRADD-LABEL: fold_mad64: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0 -; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: fold_mad64: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] -; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: fold_mad64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; GFX942-NEXT: global_store_dword v[0:1], v2, off +; GFX942-NEXT: s_endpgm %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0 `````````` </details> https://github.com/llvm/llvm-project/pull/143673 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits