llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> The main goal is to fold away wave64 code when compiled for wave32. Out-of-bounds lane indexes are now clamped to their low bits (5 bits for wave32, 6 bits for wave64), which may allow them to CSE with the operations on the low half of the wave. --- Full diff: https://github.com/llvm/llvm-project/pull/117963.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (+42-1) - (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h (+4) - (modified) llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll (+96-51) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 18a09c39a06387..a0bb3e181ac526 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -450,6 +450,37 @@ static bool isTriviallyUniform(const Use &U) { return false; } +/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1). +/// +/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64. +bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC, + IntrinsicInst &II, + unsigned LaneArgIdx) const { + unsigned MaskBits = ST->isWaveSizeKnown() && ST->isWave32() ? 5 : 6; + APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits)); + + KnownBits Known(32); + if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known)) + return true; + + if (!Known.isConstant()) + return false; + + // Unlike the DAG version, SimplifyDemandedBits does not change + // constants. Make sure we clamp these down. Out of bounds indexes may appear + // in wave64 code compiled for wave32. 
+ + Value *LaneArg = II.getArgOperand(LaneArgIdx); + Constant *MaskedConst = + ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask); + if (MaskedConst != LaneArg) { + II.getOperandUse(LaneArgIdx).set(MaskedConst); + return true; + } + + return false; +} + std::optional<Instruction *> GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Intrinsic::ID IID = II.getIntrinsicID(); @@ -1092,7 +1123,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { const Use &Src = II.getArgOperandUse(0); if (isTriviallyUniform(Src)) return IC.replaceInstUsesWith(II, Src.get()); - break; + + if (IID == Intrinsic::amdgcn_readlane && + simplifyDemandedLaneMaskArg(IC, II, 1)) + return &II; + + return std::nullopt; + } + case Intrinsic::amdgcn_writelane: { + if (simplifyDemandedLaneMaskArg(IC, II, 1)) + return &II; + return std::nullopt; } case Intrinsic::amdgcn_trig_preop: { // The intrinsic is declared with name mangling, but currently the diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 10956861650ab3..585f38fc02c29c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -220,6 +220,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> { bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const; + + bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, + unsigned LaneAgIdx) const; + std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const; std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll index b686f447b8d3c9..327d68bdf550e4 100644 --- 
a/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll @@ -18,30 +18,45 @@ define i32 @readlane_31(i32 %arg) #0 { } define i32 @readlane_32(i32 %arg) #0 { -; CHECK-LABEL: define i32 @readlane_32( -; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @readlane_32( +; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @readlane_32( +; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 32) ret i32 %res } define i32 @readlane_33(i32 %arg) #0 { -; CHECK-LABEL: define i32 @readlane_33( -; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @readlane_33( +; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @readlane_33( +; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 1) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 33) ret i32 %res } define i32 @readlane_63(i32 %arg) #0 { -; CHECK-LABEL: define i32 @readlane_63( -; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @readlane_63( +; WAVE64-SAME: i32 
[[ARG:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @readlane_63( +; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 31) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 63) ret i32 %res @@ -50,7 +65,7 @@ define i32 @readlane_63(i32 %arg) #0 { define i32 @readlane_64(i32 %arg) #0 { ; CHECK-LABEL: define i32 @readlane_64( ; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 64) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0) ; CHECK-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 64) @@ -58,11 +73,16 @@ define i32 @readlane_64(i32 %arg) #0 { } define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 { -; CHECK-LABEL: define i32 @readlane_and_31( -; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31 -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]]) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @readlane_and_31( +; WAVE64-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31 +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]]) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @readlane_and_31( +; WAVE32-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]]) +; WAVE32-NEXT: ret i32 [[RES]] ; %idx.clamp = and i32 %idx, 31 %res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 %idx.clamp) @@ -72,8 +92,7 @@ define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 { define i32 
@readlane_and_63(i32 %arg, i32 %idx) #0 { ; CHECK-LABEL: define i32 @readlane_and_63( ; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63 -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]]) ; CHECK-NEXT: ret i32 [[RES]] ; %idx.clamp = and i32 %idx, 63 @@ -92,10 +111,15 @@ define i32 @readlane_poison(i32 %arg) #0 { } define float @readlane_f32_63(float %arg) #0 { -; CHECK-LABEL: define float @readlane_f32_63( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63) -; CHECK-NEXT: ret float [[RES]] +; WAVE64-LABEL: define float @readlane_f32_63( +; WAVE64-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63) +; WAVE64-NEXT: ret float [[RES]] +; +; WAVE32-LABEL: define float @readlane_f32_63( +; WAVE32-SAME: float [[ARG:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 31) +; WAVE32-NEXT: ret float [[RES]] ; %res = call float @llvm.amdgcn.readlane.f32(float %arg, i32 63) ret float %res @@ -116,30 +140,45 @@ define i32 @writelane_31(i32 %arg0, i32 %arg1) #0 { } define i32 @writelane_32(i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: define i32 @writelane_32( -; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]]) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @writelane_32( +; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]]) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @writelane_32( +; WAVE32-SAME: i32 [[ARG0:%.*]], i32 
[[ARG1:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]]) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 32, i32 %arg1) ret i32 %res } define i32 @writelane_33(i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: define i32 @writelane_33( -; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]]) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @writelane_33( +; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]]) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @writelane_33( +; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 1, i32 [[ARG1]]) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 33, i32 %arg1) ret i32 %res } define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 { -; CHECK-LABEL: define i32 @writelane_63( -; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]]) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @writelane_63( +; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]]) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @writelane_63( +; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 31, i32 [[ARG1]]) +; WAVE32-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 63, i32 %arg1) ret i32 %res @@ -148,7 +187,7 
@@ define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 { define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 { ; CHECK-LABEL: define i32 @writelane_64( ; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 64, i32 [[ARG1]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]]) ; CHECK-NEXT: ret i32 [[RES]] ; %res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 64, i32 %arg1) @@ -156,11 +195,16 @@ define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 { } define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 { -; CHECK-LABEL: define i32 @writelane_and_31( -; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31 -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]]) -; CHECK-NEXT: ret i32 [[RES]] +; WAVE64-LABEL: define i32 @writelane_and_31( +; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31 +; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]]) +; WAVE64-NEXT: ret i32 [[RES]] +; +; WAVE32-LABEL: define i32 @writelane_and_31( +; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]]) +; WAVE32-NEXT: ret i32 [[RES]] ; %idx.clamp = and i32 %idx, 31 %res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 %idx.clamp, i32 %arg1) @@ -170,8 +214,7 @@ define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 { define i32 @writelane_and_63(i32 %arg0, i32 %arg1, i32 %idx) #0 { ; CHECK-LABEL: define i32 @writelane_and_63( ; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: 
[[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63 -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]]) +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]]) ; CHECK-NEXT: ret i32 [[RES]] ; %idx.clamp = and i32 %idx, 63 @@ -190,16 +233,18 @@ define i32 @writelane_poison(i32 %arg0, i32 %arg1) #0 { } define float @writelane_f32_63(float %arg0, float %arg1) #0 { -; CHECK-LABEL: define float @writelane_f32_63( -; CHECK-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]]) -; CHECK-NEXT: ret float [[RES]] +; WAVE64-LABEL: define float @writelane_f32_63( +; WAVE64-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]]) +; WAVE64-NEXT: ret float [[RES]] +; +; WAVE32-LABEL: define float @writelane_f32_63( +; WAVE32-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] { +; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 31, float [[ARG1]]) +; WAVE32-NEXT: ret float [[RES]] ; %res = call float @llvm.amdgcn.writelane.f32(float %arg0, i32 63, float %arg1) ret float %res } attributes #0 = { nounwind } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; WAVE32: {{.*}} -; WAVE64: {{.*}} `````````` </details> https://github.com/llvm/llvm-project/pull/117963 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits