llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-tablegen Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> This reverts d246cc618adc52fdbd69d44a2a375c8af97b6106. We now handle composing subregister extracts through reg_sequence. --- Patch is 576.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127052.diff 61 Files Affected: - (modified) llvm/lib/CodeGen/PeepholeOptimizer.cpp (-6) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+112-112) - (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+28-28) - (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+24-39) - (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+110-117) - (modified) llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll (+36-36) - (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+10-10) - (modified) llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (+5-8) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+31-27) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+33-29) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+33-29) - (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+61-53) - (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+140-124) - (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+178-178) - (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+37-37) - (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+36-55) - (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+171-298) - (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+62-62) - (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+155-155) - (modified) llvm/test/CodeGen/AMDGPU/idot8u.ll (+18-18) - (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+44-44) - (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+150-148) - (modified) llvm/test/CodeGen/AMDGPU/kernel-args.ll (+3-6) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+440-440) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+81-81) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+52-53) - (modified) llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll (+2-4) - (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+62-62) - (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+16-16) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+29-29) - (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+156-210) - (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll (+10-9) - (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+25-25) - (modified) llvm/test/CodeGen/AMDGPU/mul_int24.ll (+15-18) - (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+38-38) - (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+304-304) - (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+109-118) - (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr.ll (+8-8) - (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/srl.ll (+11-11) - (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+2-4) - (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll (+21-21) - (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+1-9) - (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+59-59) - (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+66-81) - (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll (+26-32) - (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll (+27-27) - (modified) llvm/test/CodeGen/Thumb2/mve-shuffle.ll (+53-90) - (modified) llvm/test/CodeGen/Thumb2/mve-vabdus.ll (+25-25) - (modified) llvm/test/CodeGen/Thumb2/mve-vld2.ll (+9-9) - (modified) llvm/test/CodeGen/Thumb2/mve-vld3.ll (+70-66) - (modified) llvm/test/CodeGen/Thumb2/mve-vld4.ll (+33-33) - (modified) llvm/test/CodeGen/Thumb2/mve-vldst4.ll (+55-55) - (modified) llvm/test/CodeGen/Thumb2/mve-vst2.ll (+18-18) - (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+214-209) - (modified) llvm/test/CodeGen/Thumb2/mve-vst4-post.ll (+20-20) - (modified) llvm/test/CodeGen/Thumb2/mve-vst4.ll (+69-69) ``````````diff diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 24bd9938bc45c..5416cdd39aaf3 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -421,12 +421,6 @@ class RegSequenceRewriter : public Rewriter { } bool RewriteCurrentSource(Register NewReg, unsigned NewSubReg) override { - // Do not introduce new subregister uses in a reg_sequence. Until composing - // subregister indices is supported while folding, we're just blocking - // folding of subregister copies later in the function. - if (NewSubReg) - return false; - MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx); MO.setReg(NewReg); MO.setSubReg(NewSubReg); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index d41601cc0d76e..40f29c56c8f12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1635,7 +1635,6 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] ; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] @@ -1683,32 +1682,33 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v6, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v6, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] ; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, s11, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v3, v7, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v6, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v3, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 @@ -1716,116 +1716,116 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s8, v8 -; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v2, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v4, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX9-NEXT: v_trunc_f32_e32 v16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v16 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 ; GFX9-NEXT: s_subb_u32 s20, 0, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v12, v[3:4] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v2 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[18:19], s20, v17, v[3:4] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v17, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_mul_hi_u32 v10, v17, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v16, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v12, v2 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_mul_hi_u32 v8, v17, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v12, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 +; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_add_u32_e32 v8, v10, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v8, v7, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v17, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v12, v3, vcc -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v7, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v1, v9, s[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 +; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v8, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v10, s17, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] ; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v7, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v11, s17 +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v9, s17 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v6 -; GFX9-NEXT: v_mul_lo_u32 v5, v8, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v7, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v11, vcc -; GFX9-NEXT: v_mul_hi_u32 v10, v7, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v8, v3 -; GFX9-NEXT: v_mul_hi_u32 v2, v8, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, v7, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v8, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v10, v6 +; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v6, v5, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc +; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v2 +; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, v2 ; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v9, s4, v9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 +; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 -; GFX9-NEXT: v_mov_b32_e32 v7, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 ; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v7, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] ; GFX9-NEXT: v_mov_b32_e32 v9, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 8bb8ecb079a34..bc89a186db010 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2218,31 +2218,31 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB4_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11] -; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: v_mov_b32_e32 v1, s7 -; GFX1264-NEXT: s_mov_b32 s12, s2 -; GFX1264-NEXT: s_mov_b32 s13, s3 -; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: v_mov_b32_e32 v0, s8 +; GFX1264-NEXT: v_mov_b32_e32 v1, s9 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB4_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 @@ -5800,31 +5800,31 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: s_clause 0x1 ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s11, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[8:9], exec +; GFX1264-NEXT: s_mov_b32 s11, 0 +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1264-NEXT: s_mov_b64 s[6:7], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB10_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[6:7] -; GFX1264-NEXT: s_mov_b32 s15, 0x31016000 +; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9] ; GFX1264-NEXT: s_wait_kmcnt 0x0 -; GFX1264-NEXT: s_mul_u64 s[6:7], s[4:5], s[10:11] -; GFX1264-NEXT: s_mov_b32 s14, -1 +; GFX1264-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] +; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 ; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: v_mov_b32_e32 v1, s7 -; GFX1264-NEXT: s_mov_b32 s12, s2 -; GFX1264-NEXT: s_mov_b32 s13, s3 -; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1264-NEXT: v_mov_b32_e32 v0, s8 +; GFX1264-NEXT: v_mov_b32_e32 v1, s9 +; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: s_mov_b32 s8, s2 +; GFX1264-NEXT: s_mov_b32 s9, s3 +; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1264-NEXT: s_wait_loadcnt 0x0 ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB10_2: -; GFX1264-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1264-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s4, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 16fe85bf138b2..9bbecacd6c774 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -1147,12 +1147,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; VI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; VI-NEXT: s_mov_b32 s38, -1 ; VI-NEXT: s_mov_b32 s39, 0xe80000 -; VI-NEXT: s_mov_b64 s[6:7], s[0:1] -; VI-NEXT: s_mov_b32 s0, 0 ; VI-NEXT: s_add_u32 s36, s36, s3 +; VI-NEXT: s_mov_b64 s[6:7], s[0:1] +; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s1, s0 ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -1170,12 +1169,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; CI-NEXT: s_mov_b32 s38, -1 ; CI-NEXT: s_mov_b32 s39, 0xe8f000 -; CI-NEXT: s_mov_b64 s[6:7], s[0:1] -; CI-NEXT: s_mov_b32 s0, 0 ; CI-NEXT: s_add_u32 s36, s36, s3 +; CI-NEXT: s_mov_b64 s[6:7], s[0:1] +; CI-NEXT: s_mov_b64 s[0:1], 0 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/127052 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits