https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/146054
>From 26615132899d40b8d245fd98d093ef8c26cdc3e1 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Thu, 26 Jun 2025 13:31:37 +0200 Subject: [PATCH 1/3] [DAG] Fold (setcc ((x | x >> c0 | ...) & mask)) sequences Fold sequences where we extract a bunch of contiguous bits from a value, merge them into the low bit and then check if the low bits are zero or not. It seems like a strange sequence at first but it's an idiom used by device libs to check workitem IDs for AMDGPU. The reason I put this in DAGCombiner instead of the target combiner is because this is a generic, valid transform that's also fairly niche, so there isn't much risk of a combine loop I think. See #136727 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 86 ++++++++++++++++++- .../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 34 ++------ 2 files changed, 91 insertions(+), 29 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6ca243990c468..a6eb214762fcb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28912,13 +28912,97 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, return SDValue(); } +static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, + const TargetLowering &TLI) { + // Match a pattern such as: + // (X | (X >> C0) | (X >> C1) | ...) & Mask + // This extracts contiguous parts of X and ORs them together before comparing. + // We can optimize this so that we directly check (X & SomeMask) instead, + // eliminating the shifts. 
+ + EVT VT = Root.getValueType(); + + if (Root.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N0 = Root.getOperand(0); + SDValue N1 = Root.getOperand(1); + + if (N0.getOpcode() != ISD::OR || !isa<ConstantSDNode>(N1)) + return SDValue(); + + APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal(); + if (!RootMask.isMask()) + return SDValue(); + + SDValue Src; + const auto IsSrc = [&](SDValue V) { + if (!Src) { + Src = V; + return true; + } + + return Src == V; + }; + + SmallVector<SDValue> Worklist = {N0}; + APInt PartsMask(VT.getSizeInBits(), 0); + while (!Worklist.empty()) { + SDValue V = Worklist.pop_back_val(); + if (!V.hasOneUse() && Src != V) + return SDValue(); + + if (V.getOpcode() == ISD::OR) { + Worklist.push_back(V.getOperand(0)); + Worklist.push_back(V.getOperand(1)); + continue; + } + + if (V.getOpcode() == ISD::SRL) { + SDValue ShiftSrc = V.getOperand(0); + SDValue ShiftAmt = V.getOperand(1); + + if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt)) + return SDValue(); + + PartsMask |= (RootMask << cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal()); + continue; + } + + if (IsSrc(V)) { + PartsMask |= RootMask; + continue; + } + + return SDValue(); + } + + if (!RootMask.isMask() || !Src) + return SDValue(); + + SDLoc DL(Root); + return DAG.getNode(ISD::AND, DL, VT, + {Src, DAG.getConstant(PartsMask, DL, VT)}); +} + /// This is a stub for TargetLowering::SimplifySetCC. 
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); - return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); + if (SDValue C = + TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL)) + return C; + + if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) && + N0.getOpcode() == ISD::AND && isNullConstant(N1)) { + + if (SDValue Res = matchMergedBFX(N0, DAG, TLI)) + return DAG.getSetCC(DL, VT, Res, N1, Cond); + } + + return SDValue(); } /// Given an ISD::SDIV node expressing a divide by constant, return diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 07c4aeb1ac7df..64d055bc40e98 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -12,11 +12,7 @@ define i1 @workitem_zero() { ; DAGISEL-GFX8-LABEL: workitem_zero: ; DAGISEL-GFX8: ; %bb.0: ; %entry ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1 -; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -24,10 +20,7 @@ define i1 @workitem_zero() { ; DAGISEL-GFX942-LABEL: workitem_zero: ; DAGISEL-GFX942: ; %bb.0: ; %entry ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0 -; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; DAGISEL-GFX942-NEXT: s_nop 1 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -40,11 +33,7 @@ define i1 @workitem_zero() { ; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0 ; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0 -; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd @@ -106,11 +95,7 @@ define i1 @workitem_nonzero() { ; DAGISEL-GFX8-LABEL: workitem_nonzero: ; DAGISEL-GFX8: ; %bb.0: ; %entry ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX8-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX8-NEXT: v_or_b32_e32 v1, v31, v1 -; DAGISEL-GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; DAGISEL-GFX8-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] @@ -118,10 +103,7 @@ define i1 @workitem_nonzero() { ; DAGISEL-GFX942-LABEL: workitem_nonzero: ; DAGISEL-GFX942: ; %bb.0: ; %entry ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX942-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX942-NEXT: v_or3_b32 v0, v31, v1, v0 -; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; DAGISEL-GFX942-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX942-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 ; DAGISEL-GFX942-NEXT: s_nop 1 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -134,11 +116,7 @@ define i1 @workitem_nonzero() { ; DAGISEL-GFX12-NEXT: s_wait_samplecnt 0x0 ; DAGISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 ; DAGISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v0, 20, v31 -; DAGISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 10, v31 -; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; DAGISEL-GFX12-NEXT: v_or3_b32 v0, v31, v1, v0 -; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; DAGISEL-GFX12-NEXT: v_and_b32_e32 v0, 0x3fffffff, v31 ; DAGISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; DAGISEL-GFX12-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffd >From a3ee9d6d76963c62783de9b7daac15e019ec0ff6 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Mon, 30 Jun 2025 10:19:44 +0200 Subject: [PATCH 2/3] comments --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +-- llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll | 102 ++++++++++++++++++ 2 files changed, 110 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a6eb214762fcb..2cb53c2104f50 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28922,7 +28922,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, EVT VT = Root.getValueType(); - if (Root.getOpcode() != ISD::AND) + if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND) return SDValue(); SDValue N0 = Root.getOperand(0); @@ -28932,8 +28932,6 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, return SDValue(); APInt RootMask = cast<ConstantSDNode>(N1)->getAsAPIntVal(); - if (!RootMask.isMask()) - return SDValue(); SDValue Src; const auto IsSrc = [&](SDValue V) { @@ 
-28949,7 +28947,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, APInt PartsMask(VT.getSizeInBits(), 0); while (!Worklist.empty()) { SDValue V = Worklist.pop_back_val(); - if (!V.hasOneUse() && Src != V) + if (!V.hasOneUse() && (Src && Src != V)) return SDValue(); if (V.getOpcode() == ISD::OR) { @@ -28965,7 +28963,11 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, if (!IsSrc(ShiftSrc) || !isa<ConstantSDNode>(ShiftAmt)) return SDValue(); - PartsMask |= (RootMask << cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal()); + auto ShiftAmtVal = cast<ConstantSDNode>(ShiftAmt)->getAsZExtVal(); + if (ShiftAmtVal > RootMask.getBitWidth()) + return SDValue(); + + PartsMask |= (RootMask << ShiftAmtVal); continue; } @@ -28977,7 +28979,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, return SDValue(); } - if (!RootMask.isMask() || !Src) + if (!Src) return SDValue(); SDLoc DL(Root); diff --git a/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll new file mode 100644 index 0000000000000..9d415484d4f9c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O3 -mtriple=amdgcn -mcpu=fiji %s -o - | FileCheck %s + +define i1 @basic_eq_i16_3x5(i16 %arg) { +; CHECK-LABEL: basic_eq_i16_3x5: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %a = and i16 %arg, 31 + %sh5 = lshr i16 %arg, 5 + %b = and i16 %sh5, 31 + %or = or i16 %a, %b + %sh10 = lshr i16 %arg, 10 + %c = and i16 %sh10, 31 + %or1 = or i16 %or, %c + %cmp = icmp eq i16 %or1, 0 + ret i1 %cmp +} + +define i1 @basic_eq_i32_3x5(i32 %arg) { +; CHECK-LABEL: basic_eq_i32_3x5: +; CHECK: ; %bb.0: ; %entry 
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %a = and i32 %arg, 31 + %sh5 = lshr i32 %arg, 5 + %b = and i32 %sh5, 31 + %or = or i32 %a, %b + %sh10 = lshr i32 %arg, 10 + %c = and i32 %sh10, 31 + %or1 = or i32 %or, %c + %cmp = icmp eq i32 %or1, 0 + ret i1 %cmp +} + +define i1 @basic_eq_i64_3x5(i64 %arg) { +; CHECK-LABEL: basic_eq_i64_3x5: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %a = and i64 %arg, 31 + %sh5 = lshr i64 %arg, 5 + %b = and i64 %sh5, 31 + %or = or i64 %a, %b + %sh10 = lshr i64 %arg, 10 + %c = and i64 %sh10, 31 + %or1 = or i64 %or, %c + %cmp = icmp eq i64 %or1, 0 + ret i1 %cmp +} + +define i1 @basic_ne_i32_3x5(i32 %arg) { +; CHECK-LABEL: basic_ne_i32_3x5: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %a = and i32 %arg, 31 + %sh5 = lshr i32 %arg, 5 + %b = and i32 %sh5, 31 + %or = or i32 %a, %b + %sh10 = lshr i32 %arg, 10 + %c = and i32 %sh10, 31 + %or1 = or i32 %or, %c + %cmp = icmp ne i32 %or1, 0 + ret i1 %cmp +} + +define i1 @eq_i32_3x5_holes_in_mask(i32 %arg) { +; CHECK-LABEL: eq_i32_3x5_holes_in_mask: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7f9f, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %a = and i32 %arg, 31 + %sh5 = lshr i32 %arg, 7 + %b = and i32 %sh5, 31 + %or 
= or i32 %a, %b + %sh10 = lshr i32 %arg, 10 + %c = and i32 %sh10, 31 + %or1 = or i32 %or, %c + %cmp = icmp ne i32 %or1, 0 + ret i1 %cmp +} >From c154703477645e761293952f4b3827a3bdf0efc6 Mon Sep 17 00:00:00 2001 From: pvanhout <pierre.vanhoutr...@amd.com> Date: Tue, 1 Jul 2025 11:11:29 +0200 Subject: [PATCH 3/3] comments --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++-- llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2cb53c2104f50..340dd93259cb3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -28922,6 +28922,7 @@ static SDValue matchMergedBFX(SDValue Root, SelectionDAG &DAG, EVT VT = Root.getValueType(); + // TODO: Support vectors? if (!VT.isScalarInteger() || Root.getOpcode() != ISD::AND) return SDValue(); @@ -28997,8 +28998,8 @@ SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL)) return C; - if ((Cond == ISD::SETNE || Cond == ISD::SETEQ) && - N0.getOpcode() == ISD::AND && isNullConstant(N1)) { + if (ISD::isIntEqualitySetCC(Cond) && N0.getOpcode() == ISD::AND && + isNullConstant(N1)) { if (SDValue Res = matchMergedBFX(N0, DAG, TLI)) return DAG.getSetCC(DL, VT, Res, N1, Cond); diff --git a/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll index 9d415484d4f9c..144cb0d7f2b8b 100644 --- a/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/merged-bfx-opt.ll @@ -100,3 +100,24 @@ entry: %cmp = icmp ne i32 %or1, 0 ret i1 %cmp } + +define i1 @eq_i32_3x5_all_shifted(i32 %arg) { +; CHECK-LABEL: eq_i32_3x5_all_shifted: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffc, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 
vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %sh2 = lshr i32 %arg, 2 + %a = and i32 %sh2, 31 + %sh5 = lshr i32 %arg, 7 + %b = and i32 %sh5, 31 + %or = or i32 %a, %b + %sh10 = lshr i32 %arg, 10 + %c = and i32 %sh10, 31 + %or1 = or i32 %or, %c + %cmp = icmp ne i32 %or1, 0 + ret i1 %cmp +} _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits