llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Pierre van Houtryve (Pierre-vh) <details> <summary>Changes</summary> We can add a bunch of exts/truncs during RBSelect, we should be able to fold them away afterwards. --- Patch is 184.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131307.diff 8 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCombine.td (+2-1) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+217-397) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+256-424) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+120-131) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll (+5-21) - (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+146-157) - (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (-1) - (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+22-32) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index 36653867fbba0..a21505356274b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -180,5 +180,6 @@ def AMDGPURegBankCombiner : GICombiner< [unmerge_merge, unmerge_cst, unmerge_undef, zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, - identity_combines, redundant_and]> { + identity_combines, redundant_and, constant_fold_cast_op, + cast_of_cast_combines]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 3a52497bd6e91..07fcb02d98649 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -41,10 +41,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_and_b32 s1, s1, 0x7f -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 1 ; GFX8-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -72,10 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_and_b32 s1, s1, 0x7f -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_lshr_b32 s1, s1, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, -7 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -102,9 +100,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_and_b32 s1, s1, 0x7f -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_lo_u32 v1, v0, -7 @@ -134,9 +131,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11-NEXT: s_and_b32 s2, s2, 0x7f ; GFX11-NEXT: s_and_b32 s1, s1, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -351,11 +347,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -365,11 +358,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -379,11 +369,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -393,11 +380,8 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -489,7 +473,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-LABEL: s_fshl_i8_4: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 4 ; GFX8-NEXT: s_lshr_b32 s1, s1, 4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -498,7 +481,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-LABEL: s_fshl_i8_4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshr_b32 s1, s1, 4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -508,7 +490,6 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -517,9 +498,8 @@ define amdgpu_ps i8 @s_fshl_i8_4(i8 inreg %lhs, i8 inreg %rhs) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 4 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4) @@ -586,7 +566,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-LABEL: s_fshl_i8_5: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 5 ; GFX8-NEXT: s_lshr_b32 s1, s1, 3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -595,7 +574,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX9-LABEL: s_fshl_i8_5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 5 ; GFX9-NEXT: s_lshr_b32 s1, s1, 3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -605,7 +583,6 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 5 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_lshr_b32 s1, s1, 3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -614,9 +591,8 @@ define amdgpu_ps i8 @s_fshl_i8_5(i8 inreg %lhs, i8 inreg %rhs) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s1, s1, 3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog %result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5) @@ -702,23 +678,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_andn2_b32 s3, 7, s5 ; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s3, 7, s5 ; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff @@ -733,23 +703,17 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, 7, s5 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s3, 7, s5 ; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff @@ -761,25 +725,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_and_b32 s5, s2, 7 -; GFX10-NEXT: s_lshr_b32 s6, s2, 8 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s5, s6, 7 -; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_andn2_b32 s6, 7, s6 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_lshr_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -792,25 +750,19 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_and_b32 s5, s2, 7 -; GFX11-NEXT: s_lshr_b32 s6, s2, 8 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s6, s2, 7 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s5 -; GFX11-NEXT: s_and_b32 s5, s6, 7 -; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s6 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 -; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshl_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s6, s5, 7 ; GFX11-NEXT: s_lshr_b32 s4, s4, 1 -; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshl_b32 s3, s3, s6 +; GFX11-NEXT: s_lshr_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -1030,11 +982,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1042,29 +991,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s2, s6, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX8-NEXT: s_andn2_b32 s3, 7, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, 1 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s3, 7, s9 ; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 -; GFX8-NEXT: s_lshl_b32 s2, s4, s2 ; GFX8-NEXT: s_and_b32 s3, s7, 0xff -; GFX8-NEXT: s_andn2_b32 s4, 7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s4, s2 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_andn2_b32 s4, 7, s10 ; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_andn2_b32 s5, 7, s11 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s3, s5, s3 ; GFX8-NEXT: s_lshr_b32 s4, s8, 1 -; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX8-NEXT: s_andn2_b32 s5, 7, s11 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s4, s5 @@ -1088,11 +1032,8 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1100,29 +1041,24 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s2, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, 7, s9 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s3, 7, s9 ; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 -; GFX9-NEXT: s_lshl_b32 s2, s4, s2 ; GFX9-NEXT: s_and_b32 s3, s7, 0xff -; GFX9-NEXT: s_andn2_b32 s4, 7, s10 +; GFX9-NEXT: s_lshl_b32 s2, s4, s2 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_andn2_b32 s4, 7, s10 ; GFX9-NEXT: s_lshr_b32 s3, s3, s4 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_andn2_b32 s5, 7, s11 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s3, s5, s3 ; GFX9-NEXT: s_lshr_b32 s4, s8, 1 -; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX9-NEXT: s_andn2_b32 s5, 7, s11 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 @@ -1146,41 +1082,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s2, s6, 0xff +; GFX10-NEXT: s_and_b32 s6, s9, 7 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_lshr_b32 s6, s6, 1 -; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s2, s3, s2 -; GFX10-NEXT: s_lshr_b32 s3, s6, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s2, s3 -; GFX10-NEXT: s_and_b32 s3, s7, 0xff +; GFX10-NEXT: s_or_b32 s1, s3, s2 +; GFX10-NEXT: s_and_b32 s2, s7, 0xff +; GFX10-NEXT: s_and_b32 s3, s10, 7 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s3, s3, 1 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_lshr_b32 s3, s3, s6 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 -; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s3, s4, s3 +; GFX10-NEXT: s_lshr_b32 s2, s2, s6 ; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshr_b32 s6, s8, 1 +; GFX10-NEXT: s_andn2_b32 s7, 7, s11 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshr_b32 s5, s6, s7 +; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1204,41 +1132,33 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 ; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_an... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/131307 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits