llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Patrick Simmons (linuxrocks123) <details> <summary>Changes</summary> This PR optimizes the pattern bitsin(typeof(x)) - popcnt(x) to s_bcnt0_i32 on AMDGPU. It also creates a Clang builtin for s_bcnt0_i32 so that users can call this instruction directly instead of relying on the compiler to match this pattern. --- Full diff: https://github.com/llvm/llvm-project/pull/164847.diff 5 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+3) - (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+8) - (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+43) - (modified) llvm/lib/Target/AMDGPU/SOPInstructions.td (+6-2) - (modified) llvm/test/CodeGen/AMDGPU/s_cmp_0.ll (+17-21) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 8428fa97fe445..f17156f8a24ab 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -63,6 +63,9 @@ BUILTIN(__builtin_amdgcn_grid_size_z, "Ui", "nc") BUILTIN(__builtin_amdgcn_mbcnt_hi, "UiUiUi", "nc") BUILTIN(__builtin_amdgcn_mbcnt_lo, "UiUiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt032_lo, "UiUi", "nc") +BUILTIN(__builtin_amdgcn_bcnt064_lo, "UiWUi", "nc") + TARGET_BUILTIN(__builtin_amdgcn_s_memtime, "WUi", "n", "s-memtime-inst") //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9e334d4316336..50b43a1c927ce 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2359,6 +2359,14 @@ def int_amdgcn_mbcnt_hi : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; +def int_amdgcn_bcnt032_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt032_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + +def 
int_amdgcn_bcnt064_lo : + ClangBuiltin<"__builtin_amdgcn_bcnt064_lo">, + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 8e35ba77d69aa..39b558694edf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" @@ -35,6 +36,7 @@ #include "llvm/Support/KnownFPClass.h" #include "llvm/Transforms/Utils/IntegerDivision.h" #include "llvm/Transforms/Utils/Local.h" +#include <cstdint> #define DEBUG_TYPE "amdgpu-codegenprepare" @@ -93,6 +95,13 @@ static cl::opt<bool> DisableFDivExpand( cl::ReallyHidden, cl::init(false)); +// Disable processing of fdiv so we can better test the backend implementations. 
+static cl::opt<bool> + DisableBcnt0("amdgpu-codegenprepare-disable-bcnt0", + cl::desc("Prevent transforming bitsin(typeof(x)) - " + "popcount(x) to bcnt0(x) in AMDGPUCodeGenPrepare"), + cl::ReallyHidden, cl::init(false)); + class AMDGPUCodeGenPrepareImpl : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> { public: @@ -258,6 +267,7 @@ class AMDGPUCodeGenPrepareImpl bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); + bool visitCtpop(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -1910,6 +1920,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { return visitFMinLike(I); case Intrinsic::sqrt: return visitSqrt(I); + case Intrinsic::ctpop: + return visitCtpop(I); default: return false; } @@ -1977,6 +1989,37 @@ Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder, return insertValues(Builder, FractArg->getType(), ResultVals); } +bool AMDGPUCodeGenPrepareImpl::visitCtpop(IntrinsicInst &I) { + uint32_t BitWidth, DestinationWidth, IntrinsicWidth; + if (!I.hasOneUse() || + !ST.hasBCNT(BitWidth = I.getType()->getIntegerBitWidth())) + return false; + + BinaryOperator *MustBeSub = dyn_cast<BinaryOperator>(I.user_back()); + if (!MustBeSub || MustBeSub->getOpcode() != BinaryOperator::Sub) + return false; + + ConstantInt *FirstOperand = dyn_cast<ConstantInt>(MustBeSub->getOperand(0)); + if (!FirstOperand || FirstOperand->getZExtValue() != BitWidth) + return false; + + IRBuilder<> Builder(MustBeSub); + Instruction *TransformedIns = + Builder.CreateIntrinsic(BitWidth > 32 ? 
Intrinsic::amdgcn_bcnt064_lo + : Intrinsic::amdgcn_bcnt032_lo, + {}, {I.getArgOperand(0)}); + + if ((DestinationWidth = MustBeSub->getType()->getIntegerBitWidth()) != + (IntrinsicWidth = TransformedIns->getType()->getIntegerBitWidth())) + TransformedIns = cast<Instruction>(Builder.CreateZExtOrTrunc( + TransformedIns, Type::getIntNTy(I.getContext(), DestinationWidth))); + + MustBeSub->replaceAllUsesWith(TransformedIns); + TransformedIns->takeName(MustBeSub); + MustBeSub->eraseFromParent(); + return true; +} + bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { Value *FractArg = matchFractPat(I); if (!FractArg) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 84287b621fe78..29104d33a8aa8 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -264,8 +264,12 @@ def S_BREV_B64 : SOP1_64 <"s_brev_b64", } // End isReMaterializable = 1, isAsCheapAsAMove = 1 let Defs = [SCC] in { -def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">; -def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">; +def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt032_lo> i32:$src0))] +>; +def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64", + [(set i32:$sdst, (UniformUnaryFrag<int_amdgcn_bcnt064_lo> i64:$src0))] +>; def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32", [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))] >; diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index dd5f838b4a206..db030d2b19d90 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -444,16 +444,14 @@ define amdgpu_ps i32 @bfe_u64(i64 inreg %val0) { define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { ; CHECK-LABEL: bcnt032: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b32 s0, s0 -; CHECK-NEXT: s_sub_i32 s0, 32, s0 -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART -; 
CHECK-NEXT: ; use s0 -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b32 s0, s0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s0 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %result2 = sub i32 32, %result call void asm "; use $0", "s"(i32 %result2) @@ -465,17 +463,15 @@ define amdgpu_ps i32 @bcnt032(i32 inreg %val0) { define amdgpu_ps i32 @bcnt064(i64 inreg %val0) { ; CHECK-LABEL: bcnt064: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1] -; CHECK-NEXT: s_sub_u32 s0, 64, s0 -; CHECK-NEXT: s_subb_u32 s1, 0, 0 -; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 -; CHECK-NEXT: ; return to shader part epilog +; CHECK-NEXT: s_bcnt0_i32_b64 s0, s[0:1] +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ; use s[0:1] +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog %result = call i64 @llvm.ctpop.i64(i64 %val0) nounwind readnone %result2 = sub i64 64, %result call void asm "; use $0", "s"(i64 %result2) `````````` </details> https://github.com/llvm/llvm-project/pull/164847 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
