llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> --- Patch is 284.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142174.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+13-11) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+938-1162) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1059-1305) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ab3c316f76deb..1c30d3f3bd883 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -757,7 +757,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, MVT::v2f16, Legal); // Can do this in one BFI plus a constant materialize. - setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom); + setOperationAction(ISD::FCOPYSIGN, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); @@ -5936,10 +5938,11 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || - VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || + VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || + VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); @@ -7122,18 +7125,17 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Mag = Op.getOperand(0); - SDValue Sign = Op.getOperand(1); - EVT MagVT = Mag.getValueType(); - EVT SignVT = Sign.getValueType(); - assert(MagVT.isVector()); + if (MagVT.getVectorNumElements() > 2) + return splitBinaryVectorOp(Op, DAG); + + SDValue Sign = Op.getOperand(1); + EVT SignVT = Sign.getValueType(); if (MagVT == SignVT) return Op; - assert(MagVT.getVectorNumElements() == 2); - // fcopysign v2f16:mag, v2f32:sign -> // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index a5a36d7122f68..3bc1232ce3ed1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1090,40 +1090,26 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x ; ; GFX8-LABEL: s_copysign_v3bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshr_b32 s1, s2, 16 -; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_v3bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog @@ -1131,33 +1117,19 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x ; GFX10-LABEL: s_copysign_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_v3bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 @@ -1238,101 +1210,49 @@ define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x ; ; GFX8-LABEL: s_copysign_v4bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_lshr_b32 s1, s2, 16 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 ; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_copysign_v4bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_copysign_v4bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 -; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_copysign_v4bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 -; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog %out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %arg_mag, <4 x bfloat> %arg_sign) %cast = bitcast <4 x bfloat> %out to <2 x i32> @@ -2366,67 +2286,32 @@ define <3 x bfloat> @v_copysign_v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) { ; GFX8-LABEL: v_copysign_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_bfi_b32 v3, s4, v4, v3 +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 -; GFX9-NEXT: v_bfi_b32 v3, s4, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v5, v4 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_copysign_v3bf16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5 -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_copysign_v3bf16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v4 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_v3bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) ret <3 x bfloat> %result } @@ -2501,93 +2386,32 @@ define <4 x bfloat> @v_copysign_v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign) { ; GFX8-LABEL: v_copysign_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: v_bfi_b32 v4, s4, v5, v4 -; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX8-NEXT: v_bfi_b32 v3, s4, v5, v3 +; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_copysign_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfi_b32 v4, s4, v1, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 -; GFX9-NEXT: v_bfi_b32 v3, s4, v0, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_copysign_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v6, v5 -; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, v7, v4 -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 +; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_copysign_v4bf16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v6, v7 -; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5 -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v1, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_copysign_v4bf16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3 -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v6, v5 -; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v7, v4 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <4 x bfloat... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/142174 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits