llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-nvptx @llvm/pr-subscribers-backend-powerpc Author: Simon Pilgrim (RKSimon) <details> <summary>Changes</summary> This is a partial revert of https://github.com/llvm/llvm-project/pull/145939 (I've kept the BUILD_VECTOR(FREEZE(UNDEF), FREEZE(UNDEF), elt2, ...) canonicalization) as we're getting reports of infinite loops (https://github.com/llvm/llvm-project/issues/148084). The issue appears to be due to deep chains of nodes and how visitFREEZE replaces all instances of an operand with a common frozen version - other users of the original frozen node then get added back to the worklist but might no longer be able to confirm a node isn't poison due to recursion depth limits on isGuaranteedNotToBeUndefOrPoison. The issue still exists with the old implementation but by only allowing a single frozen operand it helps prevent cases of interdependent frozen nodes. I'm still working on supporting multiple operands as its critical for topological DAG handling but need to get a fix in for trunk and 21.x. Fixes https://github.com/llvm/llvm-project/issues/148084 --- Patch is 202.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150425.diff 20 Files Affected: - (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+18-1) - (modified) llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (+3-2) - (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+11-4) - (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+24-40) - (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+24-40) - (modified) llvm/test/CodeGen/NVPTX/i1-select.ll (+15-15) - (modified) llvm/test/CodeGen/NVPTX/i128.ll (+290-292) - (modified) llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll (+204-213) - (modified) llvm/test/CodeGen/RISCV/fpclamptosat.ll (+44-44) - (modified) llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll (+18-18) - (added) llvm/test/CodeGen/RISCV/pr148084.ll (+279) - (modified) llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll (+1018-883) - (modified) llvm/test/CodeGen/SystemZ/pr60413.ll (+18-18) - (modified) llvm/test/CodeGen/X86/abds-neg.ll (+18-18) - (modified) llvm/test/CodeGen/X86/avg.ll (+90-87) - (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll (+19-20) - (modified) llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll (+6-6) - (modified) llvm/test/CodeGen/X86/freeze-vector.ll (+12-12) - (modified) llvm/test/CodeGen/X86/pr62286.ll (+4-4) - (modified) llvm/test/CodeGen/X86/setcc-non-simple-type.ll (+2-2) ``````````diff diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 23812d795f5fa..91fd2d843f445 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16717,7 +16717,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // Fold freeze(op(x, ...)) -> op(freeze(x), ...). // Try to push freeze through instructions that propagate but don't produce // poison as far as possible. If an operand of freeze follows three - // conditions 1) one-use, and 2) does not produce poison then push + // conditions 1) one-use, 2) does not produce poison, and 3) has all but one + // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push // the freeze through to the operands that are not guaranteed non-poison. // NOTE: we will strip poison-generating flags, so ignore them here. if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false, @@ -16725,6 +16726,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { N0->getNumValues() != 1 || !N0->hasOneUse()) return SDValue(); + // TOOD: we should always allow multiple operands, however this increases the + // likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call + // below causing later nodes that share frozen operands to fold again and no + // longer being able to confirm other operands are not poison due to recursion + // depth limits on isGuaranteedNotToBeUndefOrPoison. + bool AllowMultipleMaybePoisonOperands = + N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC || + N0.getOpcode() == ISD::BUILD_VECTOR || + N0.getOpcode() == ISD::BUILD_PAIR || + N0.getOpcode() == ISD::VECTOR_SHUFFLE || + N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL; + // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all // ones" or "constant" into something that depends on FrozenUndef. We can // instead pick undef values to keep those properties, while at the same time @@ -16757,6 +16770,10 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { MaybePoisonOperandNumbers.push_back(OpNo); if (!HadMaybePoisonOperands) continue; + if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) { + // Multiple maybe-poison ops when not allowed - bail out. + return SDValue(); + } } // NOTE: the whole op may be not guaranteed to not be undef or poison because // it could create undef or poison due to it's poison-generating flags. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e0597988e8907..37fddcf943681 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -778,7 +778,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( case ISD::FREEZE: { SDValue N0 = Op.getOperand(0); if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, - /*PoisonOnly=*/false)) + /*PoisonOnly=*/false, Depth + 1)) return N0; break; } @@ -3369,7 +3369,8 @@ bool TargetLowering::SimplifyDemandedVectorElts( case ISD::FREEZE: { SDValue N0 = Op.getOperand(0); if (TLO.DAG.isGuaranteedNotToBeUndefOrPoison(N0, DemandedElts, - /*PoisonOnly=*/false)) + /*PoisonOnly=*/false, + Depth + 1)) return TLO.CombineTo(Op, N0); // TODO: Replace this with the general fold from DAGCombiner::visitFREEZE diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5e35d5630d667..d755c7e9bf3b3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45096,27 +45096,34 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE vector insert/extracts use modulo indices. + case X86ISD::PINSRB: + case X86ISD::PINSRW: + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: + return false; // SSE vector multiplies are either inbounds or saturate. case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: + return false; // SSE vector shifts handle out of bounds shift amounts. case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: return false; - // SSE blends. + // SSE blends. case X86ISD::BLENDI: case X86ISD::BLENDV: return false; - // SSE target shuffles. + // SSE target shuffles. case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERMV3: return false; - // SSE comparisons handle all icmp/fcmp cases. - // TODO: Add CMPM/MM with test coverage. + // SSE comparisons handle all icmp/fcmp cases. + // TODO: Add CMPM/MM with test coverage. case X86ISD::CMPP: case X86ISD::PCMPEQ: case X86ISD::PCMPGT: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index f8e13fcdd2273..51398a45055eb 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1043,10 +1035,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2664,28 +2656,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -2696,7 +2681,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3232,10 +3216,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index ba9dd8f7c2468..6512bee36e88b 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1081,10 +1073,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1897,28 +1889,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -1929,7 +1914,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2465,10 +2449,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/150425 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits