[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
arsenm wrote: ### Merge activity * **May 31, 5:58 AM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142177). https://github.com/llvm/llvm-project/pull/142177 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
arsenm wrote: ### Merge activity * **May 31, 5:58 AM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142175). https://github.com/llvm/llvm-project/pull/142175 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] Backport: [clang] Serialization: support hashing null template arguments (PR #141957)
https://github.com/mizvekov edited https://github.com/llvm/llvm-project/pull/141957 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes Make symmetric with other copysign tests --- Patch is 35.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142114.diff 1 Files Affected: - (added) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+959) ``diff diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll new file mode 100644 index 0..4fcce8a6d623f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -0,0 +1,959 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN +; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 + +declare bfloat @llvm.copysign.bf16(bfloat, bfloat) + +define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { +; GCN-LABEL: v_copysign_bf16_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, v0, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_bf16_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; 
GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_copysign_bf16_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT:s_setpc_b64 s[30:31] + %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) + ret bfloat %op +} + +define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { +; GCN-LABEL: v_copysign_bf16_s_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:s_and_b32 s4, s16, 0x8000 +; GCN-NEXT:s_lshr_b32 s4, s4, 16 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, s4, v0 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_bf16_s_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:s_and_b32 s4, s16, 0x8000 +; GFX7-NEXT:s_lshr_b32 s4, s4, 16 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, s4, v0 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_s_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_mov_b32_e32 v1, s16 +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_s_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_mov_b32_e32 v1, s16 +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_s_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, s16 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_copysign_b
[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142115 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Patch is 365.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142115.diff 2 Files Affected: - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+4746-3) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+3652) ``diff diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4fcce8a6d623f..e99a6bf273e3b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -954,6 +954,4749 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1 ret <2 x i32> %ins.1 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11FAKE16: {{.*}} -; GFX11TRUE16: {{.*}} + +define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat> inreg %arg_sign) { +; GCN-LABEL: s_copysign_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT:v_mul_f32_e64 v0, 1.0, s3 +; GCN-NEXT:v_mul_f32_e64 v1, 1.0, s2 +; GCN-NEXT:v_mul_f32_e64 v2, 1.0, s1 +; GCN-NEXT:v_mul_f32_e64 v3, 1.0, s0 +; GCN-NEXT:v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_bfe_u32 v3, v3, 16, 15 +; GCN-NEXT:v_bfe_u32 v2, v2, 16, 15 +; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT:v_and_b32_e32 v0, 0x8000, v0 +; GCN-NEXT:v_or_b32_e32 v1, v3, v1 +; GCN-NEXT:v_or_b32_e32 v0, v2, v0 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v0, v1, v0 +; GCN-NEXT:v_readfirstlane_b32 s0, v0 +; GCN-NEXT:; return to shader part epilog +; +; GFX7-LABEL: s_copysign_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:v_mul_f32_e64 v0, 1.0, s3 +; GFX7-NEXT:v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:v_mul_f32_e64 v1, 1.0, s2 +; GFX7-NEXT:v_mul_f32_e64 v2, 1.0, s1 +; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; 
GFX7-NEXT:v_mul_f32_e64 v3, 1.0, s0 +; GFX7-NEXT:v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT:v_bfe_u32 v2, v2, 16, 15 +; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT:v_bfe_u32 v3, v3, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, v2, v0 +; GFX7-NEXT:v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT:v_readfirstlane_b32 s0, v0 +; GFX7-NEXT:; return to shader part epilog +; +; GFX8-LABEL: s_copysign_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_movk_i32 s2, 0x7fff +; GFX8-NEXT:v_mov_b32_e32 v0, s0 +; GFX8-NEXT:v_mov_b32_e32 v1, s1 +; GFX8-NEXT:s_lshr_b32 s1, s1, 16 +; GFX8-NEXT:s_lshr_b32 s0, s0, 16 +; GFX8-NEXT:v_bfi_b32 v0, s2, v0, v1 +; GFX8-NEXT:v_mov_b32_e32 v1, s0 +; GFX8-NEXT:v_mov_b32_e32 v2, s1 +; GFX8-NEXT:v_bfi_b32 v1, s2, v1, v2 +; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT:v_readfirstlane_b32 s0, v0 +; GFX8-NEXT:; return to shader part epilog +; +; GFX9-LABEL: s_copysign_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_movk_i32 s2, 0x7fff +; GFX9-NEXT:v_mov_b32_e32 v0, s0 +; GFX9-NEXT:v_mov_b32_e32 v1, s1 +; GFX9-NEXT:s_lshr_b32 s1, s1, 16 +; GFX9-NEXT:s_lshr_b32 s0, s0, 16 +; GFX9-NEXT:v_bfi_b32 v0, s2, v0, v1 +; GFX9-NEXT:v_mov_b32_e32 v1, s0 +; GFX9-NEXT:v_mov_b32_e32 v2, s1 +; GFX9-NEXT:v_bfi_b32 v1, s2, v1, v2 +; GFX9-NEXT:v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT:v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT:v_readfirstlane_b32 s0, v0 +; GFX9-NEXT:; return to shader part epilog +; +; GFX10-LABEL: s_copysign_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:v_mov_b32_e32 v0, s1 +; GFX10-NEXT:s_lshr_b32 s1, s1, 16 +; GFX10-NEXT:v_mov_b32_e32 v1, s1 +; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX10-NEXT:s_lshr_b32 s0, s0, 16 +; GFX10-NEXT:v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX10-NEXT:v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT:v_lshl_or_b32 v0, v1, 16, v0 +; GFX10-NEXT:v_readfirstlane_b32 s0, v0 +; GFX10-NEXT:;
return to shader part epilog +; +; GFX11-LABEL: s_copysign_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT:v_mov_b32_e32 v0, s1 +; GFX11-NEXT:s_lshr_b32 s1, s1, 16 +; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT:v_mov_b32_e32 v1, s1 +; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-NEXT:s_lshr_b32 s0, s0, 16 +; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT:v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT:v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT:v_lshl_or
[llvm-branch-commits] [mlir] [mlir] Unique property constraints where possible (PR #140849)
https://github.com/joker-eph approved this pull request. Nice! https://github.com/llvm/llvm-project/pull/140849 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- Full diff: https://github.com/llvm/llvm-project/pull/142157.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+27-8) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+28-36) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+10-176) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. @@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. 
- SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 90a368885bfdc..45bf0770ad924 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; 
GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT:v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT:v_and_b32_e32 v2, 0x8000, v5 ; GFX7-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT:v_l
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142114 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156 >From 9427bd08206493b681edacd5e54da977bee8fd86 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:03:35 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++- .../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +-- 4 files changed, 242 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. 
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 32e3f72af516f..3bd068362410b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142157 >From ff07bad7e0442c2b4deabadda4d5242e9b190451 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 501 ++--- 3 files changed, 129 insertions(+), 581 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
@@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bd068362410b..26ea80a802f91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 
16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT:v_mul_f32_e32 v
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142157 >From ff07bad7e0442c2b4deabadda4d5242e9b190451 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 501 ++--- 3 files changed, 129 insertions(+), 581 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
@@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bd068362410b..26ea80a802f91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 
16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT:v_mul_f32_e32 v
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156 >From 9427bd08206493b681edacd5e54da977bee8fd86 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:03:35 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++- .../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +-- 4 files changed, 242 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. 
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 32e3f72af516f..3bd068362410b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU
[llvm-branch-commits] [clang] Backport: [clang] Serialization: support hashing null template arguments (PR #141957)
https://github.com/nikic milestoned https://github.com/llvm/llvm-project/pull/141957 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142115 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142114 Make symmetric with other copysign tests >From da7b0574d489d67f6f05dd396e4a8bdf95941bf8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 11:21:42 +0200 Subject: [PATCH] AMDGPU: Move bf16 copysign tests to separate file Make symmetric with other copysign tests --- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 959 + 1 file changed, 959 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll new file mode 100644 index 0..4fcce8a6d623f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -0,0 +1,959 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN +; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 + +declare bfloat @llvm.copysign.bf16(bfloat, bfloat) + +define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { +; GCN-LABEL: v_copysign_bf16_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, v0, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: 
v_copysign_bf16_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_copysign_bf16_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT:s_setpc_b64 s[30:31] + %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) + ret bfloat %op +} + +define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { +; GCN-LABEL: v_copysign_bf16_s_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:s_and_b32 s4, s16, 0x8000 +; GCN-NEXT:s_lshr_b32 s4, s4, 16 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, s4, v0 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_bf16_s_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:s_and_b32 s4, s16, 0x8000 +; GFX7-NEXT:s_lshr_b32 s4, s4, 16 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; 
GFX7-NEXT:v_or_b32_e32 v0, s4, v0 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_s_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_mov_b32_e32 v1, s16 +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_s_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_mov_b32_e32 v1, s16 +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_s_bf16: +; GFX10: ; %bb.
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142114 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] f981d2e - Revert "[Doc][NFC] Fix a typo in SanitizerSpecialCaseList doc. (#142168)"
Author: Qinkun Bao Date: 2025-05-30T12:10:48-04:00 New Revision: f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7 URL: https://github.com/llvm/llvm-project/commit/f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7 DIFF: https://github.com/llvm/llvm-project/commit/f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7.diff LOG: Revert "[Doc][NFC] Fix a typo in SanitizerSpecialCaseList doc. (#142168)" This reverts commit 6a47241c9983c46d805034821f04c34a475a254f. Added: Modified: clang/docs/SanitizerSpecialCaseList.rst Removed: diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst index 6f924cfa97a97..b82db78a9203c 100644 --- a/clang/docs/SanitizerSpecialCaseList.rst +++ b/clang/docs/SanitizerSpecialCaseList.rst @@ -102,7 +102,7 @@ supported sanitizers. char c = toobig; // also not instrumented } -If multiple entries match the source, then the latest entry takes the +If multiple entries match the source, than the latest entry takes the precedence. .. code-block:: bash ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142173 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142175 >From d94e349591ac69d46c5061b2af7722a49bbb5902 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:46:06 +0200 Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 229 ++-- 3 files changed, 74 insertions(+), 459 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1c30d3f3bd883..ecfa6daf7803d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, - {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, +MVT::v8f16, MVT::v8bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || - VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || - VT == MVT::v32f16); + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || + VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bc1232ce3ed1..ab4cff2469467 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX8-LABEL: s_copysign_v8bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s8, 0x7fff +; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s3 ; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s7 -; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s2 ; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:s_lshr_b32 s3, s6, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s2 -; GFX8-NEXT:v_mov_b32_e32 v3, s3 -; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s1 ; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:s_lshr_b32 s2, s5, 16 -; GFX8-NEXT:s_lshr_b32 s1, s1, 16 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s1 -; GFX8-NEXT:v_mov_b32_e32 v4, s2 -; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s0 ; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:s_lshr_b32 s1, s4, 16 -; GFX8-NEXT:s_lshr_b32 s0, s0, 16 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s0 -; GFX8-NEXT:v_mov_b32_e32 v5, s1 -; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX8-NEXT:v_readfirstlane_b32 s0, v3 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1 @@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX9-LABEL: s_copysign_v8bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s8, 0x7fff +; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s3 ; GFX9-NEXT:v_mov_b32_e32 v1, s7 -; GFX9-NEXT:s_lshr_b32 s7, s7, 16 -; GFX9-NEXT:s_lshr_b32 s3, s3, 16 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX9-NEXT:v_mov_b32_e32 v1, s3 -; GF
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142176 >From bae344390de7b6e851ed40f356091d3b5f72b48e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:48:01 +0200 Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 431 ++-- 3 files changed, 126 insertions(+), 876 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecfa6daf7803d..3535eb41682d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || - VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index ab4cff2469467..4bbd170529ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1719,87 +1719,31 @@ 
define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; ; GFX8-LABEL: s_copysign_v16bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s16, 0x7fff +; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s7 ; GFX8-NEXT:v_mov_b32_e32 v1, s15 -; GFX8-NEXT:s_lshr_b32 s15, s15, 16 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:v_mov_b32_e32 v2, s15 -; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s6 ; GFX8-NEXT:v_mov_b32_e32 v2, s14 -; GFX8-NEXT:s_lshr_b32 s7, s14, 16 -; GFX8-NEXT:s_lshr_b32 s6, s6, 16 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:v_mov_b32_e32 v3, s7 -; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s5 ; GFX8-NEXT:v_mov_b32_e32 v3, s13 -; GFX8-NEXT:s_lshr_b32 s6, s13, 16 -; GFX8-NEXT:s_lshr_b32 s5, s5, 16 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:v_mov_b32_e32 v4, s6 -; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s4 ; GFX8-NEXT:v_mov_b32_e32 v4, s12 -; GFX8-NEXT:s_lshr_b32 s5, s12, 16 -; GFX8-NEXT:s_lshr_b32 s4, s4, 16 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:v_mov_b32_e32 v5, s5 -; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v4, s3 ; GFX8-NEXT:v_mov_b32_e32 v5, s11 
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_mov_b32_e32 v5, s3 -; GFX8-NEXT:v_mov_b32_e32 v6, s4 -; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v5, s2 ; GFX8-NEXT:v_mov_b32_e32 v6, s10 -; GFX8-NEXT:s_lshr_b32 s3, s10, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142177 >From 039aac3892486d499dec0c72995bf0a75e86a409 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:53:15 +0200 Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 + llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 307 + 3 files changed, 999 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, +MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> 
@v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> %result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT:s_waitcnt vmcnt(1) +; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT:v_or_b32_e32 v31, v32, v31 +; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT:v_or_b32_e32 v30, v30, v32 +; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT:v_or_b32_e32 v29, v29, v32 +; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT:v_or_b32_e32 v28, v28, v32 +; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 
0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT:v_or_b32_e32 v27, v27, v32 +; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT:v_or_b32_e32 v26, v26, v32 +; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT:s_w
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142176 >From 93748937ce90b591ef40e2d75e96c7f1904758f4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:48:01 +0200 Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 431 ++-- 3 files changed, 126 insertions(+), 876 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecfa6daf7803d..3535eb41682d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || - VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index ab4cff2469467..4bbd170529ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1719,87 +1719,31 @@ 
define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; ; GFX8-LABEL: s_copysign_v16bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s16, 0x7fff +; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s7 ; GFX8-NEXT:v_mov_b32_e32 v1, s15 -; GFX8-NEXT:s_lshr_b32 s15, s15, 16 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:v_mov_b32_e32 v2, s15 -; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s6 ; GFX8-NEXT:v_mov_b32_e32 v2, s14 -; GFX8-NEXT:s_lshr_b32 s7, s14, 16 -; GFX8-NEXT:s_lshr_b32 s6, s6, 16 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:v_mov_b32_e32 v3, s7 -; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s5 ; GFX8-NEXT:v_mov_b32_e32 v3, s13 -; GFX8-NEXT:s_lshr_b32 s6, s13, 16 -; GFX8-NEXT:s_lshr_b32 s5, s5, 16 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:v_mov_b32_e32 v4, s6 -; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s4 ; GFX8-NEXT:v_mov_b32_e32 v4, s12 -; GFX8-NEXT:s_lshr_b32 s5, s12, 16 -; GFX8-NEXT:s_lshr_b32 s4, s4, 16 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:v_mov_b32_e32 v5, s5 -; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v4, s3 ; GFX8-NEXT:v_mov_b32_e32 v5, s11 
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_mov_b32_e32 v5, s3 -; GFX8-NEXT:v_mov_b32_e32 v6, s4 -; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v5, s2 ; GFX8-NEXT:v_mov_b32_e32 v6, s10 -; GFX8-NEXT:s_lshr_b32 s3, s10, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156 >From 158179f7aba2fcdc96091da39f33ad99fd040af6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:03:35 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++- .../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +-- 4 files changed, 242 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. 
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 32e3f72af516f..3bd068362410b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142157 >From ed0712298fd1c3a625ad870d54c5bf3c21052712 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 501 ++--- 3 files changed, 129 insertions(+), 581 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
@@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bd068362410b..26ea80a802f91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 
16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT:v_mul_f32_e32 v
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156 >From 158179f7aba2fcdc96091da39f33ad99fd040af6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:03:35 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++- .../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +-- 4 files changed, 242 insertions(+), 291 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. 
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 32e3f72af516f..3bd068362410b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142177 >From f6e957bcc7f122fb35e0ecc7dfa82fec56b2a865 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:53:15 +0200 Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 + llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 307 + 3 files changed, 999 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, +MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> 
@v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> %result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT:s_waitcnt vmcnt(1) +; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT:v_or_b32_e32 v31, v32, v31 +; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT:v_or_b32_e32 v30, v30, v32 +; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT:v_or_b32_e32 v29, v29, v32 +; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT:v_or_b32_e32 v28, v28, v32 +; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 
0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT:v_or_b32_e32 v27, v27, v32 +; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT:v_or_b32_e32 v26, v26, v32 +; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT:s_w
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142177 None >From e55b837d5e54d23a16219ca133838fdcab3b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:53:15 +0200 Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 + llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 307 + 3 files changed, 999 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, +MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> 
@v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> %result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT:s_waitcnt vmcnt(1) +; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT:v_or_b32_e32 v31, v32, v31 +; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT:v_or_b32_e32 v30, v30, v32 +; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT:v_or_b32_e32 v29, v29, v32 +; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT:v_or_b32_e32 v28, v28, v32 +; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 
0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT:v_or_b32_e32 v27, v27, v32 +; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT:v_or_b32_e32 v26, v26, v32 +; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT:
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142156 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142157 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142174 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142175 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142177 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
arsenm wrote: ### Merge activity * **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142157). https://github.com/llvm/llvm-project/pull/142157 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
https://github.com/rampitec approved this pull request. LGTM with a nit: title says it is legal, but it is custom. https://github.com/llvm/llvm-project/pull/142173 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142114 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
arsenm wrote: ### Merge activity * **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142156). https://github.com/llvm/llvm-project/pull/142156 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142176 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
arsenm wrote: > LGTM with a nit: title says it is legal, but it is custom. The same type size is still treated as legal https://github.com/llvm/llvm-project/pull/142173 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
shiltian wrote: Is it a "move" or adds new tests? There doesn't seem to be any delete. https://github.com/llvm/llvm-project/pull/142114 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142177 >From f6e957bcc7f122fb35e0ecc7dfa82fec56b2a865 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:53:15 +0200 Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 + llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 307 + 3 files changed, 999 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, +MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> 
@v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> %result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT:s_waitcnt vmcnt(1) +; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT:v_or_b32_e32 v31, v32, v31 +; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT:v_or_b32_e32 v30, v30, v32 +; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT:v_or_b32_e32 v29, v29, v32 +; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT:v_or_b32_e32 v28, v28, v32 +; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 
0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT:v_or_b32_e32 v27, v27, v32 +; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT:v_or_b32_e32 v26, v26, v32 +; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT:s_w
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142176 >From 93748937ce90b591ef40e2d75e96c7f1904758f4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:48:01 +0200 Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 431 ++-- 3 files changed, 126 insertions(+), 876 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecfa6daf7803d..3535eb41682d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || - VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index ab4cff2469467..4bbd170529ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1719,87 +1719,31 @@ 
define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; ; GFX8-LABEL: s_copysign_v16bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s16, 0x7fff +; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s7 ; GFX8-NEXT:v_mov_b32_e32 v1, s15 -; GFX8-NEXT:s_lshr_b32 s15, s15, 16 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:v_mov_b32_e32 v2, s15 -; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s6 ; GFX8-NEXT:v_mov_b32_e32 v2, s14 -; GFX8-NEXT:s_lshr_b32 s7, s14, 16 -; GFX8-NEXT:s_lshr_b32 s6, s6, 16 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:v_mov_b32_e32 v3, s7 -; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s5 ; GFX8-NEXT:v_mov_b32_e32 v3, s13 -; GFX8-NEXT:s_lshr_b32 s6, s13, 16 -; GFX8-NEXT:s_lshr_b32 s5, s5, 16 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:v_mov_b32_e32 v4, s6 -; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s4 ; GFX8-NEXT:v_mov_b32_e32 v4, s12 -; GFX8-NEXT:s_lshr_b32 s5, s12, 16 -; GFX8-NEXT:s_lshr_b32 s4, s4, 16 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:v_mov_b32_e32 v5, s5 -; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v4, s3 ; GFX8-NEXT:v_mov_b32_e32 v5, s11 
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_mov_b32_e32 v5, s3 -; GFX8-NEXT:v_mov_b32_e32 v6, s4 -; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v5, s2 ; GFX8-NEXT:v_mov_b32_e32 v6, s10 -; GFX8-NEXT:s_lshr_b32 s3, s10, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142175 >From 883a508fa80728ad2a916d4a5963b23cf585aaa2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:46:06 +0200 Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 229 ++-- 3 files changed, 74 insertions(+), 459 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1c30d3f3bd883..ecfa6daf7803d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, - {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, +MVT::v8f16, MVT::v8bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || - VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || - VT == MVT::v32f16); + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || + VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bc1232ce3ed1..ab4cff2469467 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX8-LABEL: s_copysign_v8bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s8, 0x7fff +; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s3 ; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s7 -; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s2 ; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:s_lshr_b32 s3, s6, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s2 -; GFX8-NEXT:v_mov_b32_e32 v3, s3 -; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s1 ; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:s_lshr_b32 s2, s5, 16 -; GFX8-NEXT:s_lshr_b32 s1, s1, 16 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s1 -; GFX8-NEXT:v_mov_b32_e32 v4, s2 -; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s0 ; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:s_lshr_b32 s1, s4, 16 -; GFX8-NEXT:s_lshr_b32 s0, s0, 16 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s0 -; GFX8-NEXT:v_mov_b32_e32 v5, s1 -; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX8-NEXT:v_readfirstlane_b32 s0, v3 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1 @@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX9-LABEL: s_copysign_v8bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s8, 0x7fff +; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s3 ; GFX9-NEXT:v_mov_b32_e32 v1, s7 -; GFX9-NEXT:s_lshr_b32 s7, s7, 16 -; GFX9-NEXT:s_lshr_b32 s3, s3, 16 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX9-NEXT:v_mov_b32_e32 v1, s3 -; GF
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142114 >From da7b0574d489d67f6f05dd396e4a8bdf95941bf8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 11:21:42 +0200 Subject: [PATCH 1/2] AMDGPU: Move bf16 copysign tests to separate file Make symmetric with other copysign tests --- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 959 + 1 file changed, 959 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll new file mode 100644 index 0..4fcce8a6d623f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -0,0 +1,959 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN +; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 + +declare bfloat @llvm.copysign.bf16(bfloat, bfloat) + +define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) { +; GCN-LABEL: v_copysign_bf16_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, v0, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_bf16_bf16: +; GFX7: ; %bb.0: +; 
GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_copysign_bf16_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT:s_setpc_b64 s[30:31] + %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) + ret bfloat %op +} + +define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { +; GCN-LABEL: v_copysign_bf16_s_bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT:s_and_b32 s4, s16, 0x8000 +; GCN-NEXT:s_lshr_b32 s4, s4, 16 +; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GCN-NEXT:v_or_b32_e32 v0, s4, v0 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_copysign_bf16_s_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT:s_and_b32 s4, s16, 0x8000 +; GFX7-NEXT:s_lshr_b32 s4, s4, 16 +; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15 +; GFX7-NEXT:v_or_b32_e32 v0, s4, v0 +; 
GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT:s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_copysign_bf16_s_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:v_mov_b32_e32 v1, s16 +; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX8-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_copysign_bf16_s_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:v_mov_b32_e32 v1, s16 +; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_copysign_bf16_s_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142157 >From ed0712298fd1c3a625ad870d54c5bf3c21052712 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 501 ++--- 3 files changed, 129 insertions(+), 581 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
@@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bd068362410b..26ea80a802f91 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 
16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] -; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT:v_mul_f32_e32 v
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes Fixes #141931 --- Patch is 153.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142173.diff 6 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1) - (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10) - (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-6) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+515-610) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+550-649) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c61c52ec5843e..ab3c316f76deb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -756,6 +756,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // allows matching fneg (fabs x) patterns) setOperationAction(ISD::FABS, MVT::v2f16, Legal); +// Can do this in one BFI plus a constant materialize. 
+setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom); + setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); @@ -6088,6 +6091,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SADDSAT: case ISD::SSUBSAT: return splitBinaryVectorOp(Op, DAG); + case ISD::FCOPYSIGN: +return lowerFCOPYSIGN(Op, DAG); case ISD::MUL: return lowerMUL(Op, DAG); case ISD::SMULO: @@ -7115,6 +7120,32 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, return DAG.getZExtOrTrunc(NewVal, DL, OpTy); } +SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { + SDValue Mag = Op.getOperand(0); + SDValue Sign = Op.getOperand(1); + + EVT MagVT = Mag.getValueType(); + EVT SignVT = Sign.getValueType(); + + assert(MagVT.isVector()); + + if (MagVT == SignVT) +return Op; + + assert(MagVT.getVectorNumElements() == 2); + + // fcopysign v2f16:mag, v2f32:sign -> + // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16) + + SDLoc SL(Op); + SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign); + SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32); + + SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16); + + return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16); +} + // Custom lowering for vector multiplications and s_mul_u64. 
SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index c42366a1c04c8..283f8136d352a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -149,6 +149,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const; SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const; + SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2e2913d88cc54..28557ad516865 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2062,6 +2062,16 @@ def : GCNPat < >; } // End foreach fp16vt = [f16, bf16] + +foreach fp16vt = [v2f16, v2bf16] in { + +def : GCNPat < + (fcopysign fp16vt:$src0, fp16vt:$src1), + (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1) +>; + +} + /** == **/ /** Immediate Patterns **/ /** == **/ diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 15b049d4d7563..021104114d796 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -36,17 +36,12 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3 ; GFX9-NEXT:v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT:v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; GFX9-NEXT:s_mov_b32 s4, 0x7fff7fff ; 
GFX9-NEXT:v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT:v_pack_b32_f16 v1, v1, v2 ; GFX9-NEXT:v_pk_mul_f16 v1, v3, v
[llvm-branch-commits] [mlir] [MLIR] Add apply_patterns.vector.arm_sve.lower_contraction TD Op (PR #140572)
https://github.com/banach-space approved this pull request. LGTM, thanks! https://github.com/llvm/llvm-project/pull/140572 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142176 None >From a05dedab56153ae13dfa3ed168e73b42d4188bb0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:48:01 +0200 Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 431 ++-- 3 files changed, 126 insertions(+), 876 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecfa6daf7803d..3535eb41682d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || - VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index ab4cff2469467..4bbd170529ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1719,87 
+1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> inreg %arg_mag, <16 ; ; GFX8-LABEL: s_copysign_v16bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s16, 0x7fff +; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s7 ; GFX8-NEXT:v_mov_b32_e32 v1, s15 -; GFX8-NEXT:s_lshr_b32 s15, s15, 16 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:v_mov_b32_e32 v2, s15 -; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s6 ; GFX8-NEXT:v_mov_b32_e32 v2, s14 -; GFX8-NEXT:s_lshr_b32 s7, s14, 16 -; GFX8-NEXT:s_lshr_b32 s6, s6, 16 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:v_mov_b32_e32 v3, s7 -; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s5 ; GFX8-NEXT:v_mov_b32_e32 v3, s13 -; GFX8-NEXT:s_lshr_b32 s6, s13, 16 -; GFX8-NEXT:s_lshr_b32 s5, s5, 16 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:v_mov_b32_e32 v4, s6 -; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s4 ; GFX8-NEXT:v_mov_b32_e32 v4, s12 -; GFX8-NEXT:s_lshr_b32 s5, s12, 16 -; GFX8-NEXT:s_lshr_b32 s4, s4, 16 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:v_mov_b32_e32 v5, s5 -; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v4, s3 ; 
GFX8-NEXT:v_mov_b32_e32 v5, s11 -; GFX8-NEXT:s_lshr_b32 s4, s11, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_mov_b32_e32 v5, s3 -; GFX8-NEXT:v_mov_b32_e32 v6, s4 -; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v5, s2 ; GFX8-NEXT:v_mov_b32_e32 v6, s10 -; GFX8-NEXT:s_lshr_b32 s3, s10, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142176
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#142177** — https://app.graphite.dev/github/pr/llvm/llvm-project/142177
* **#142176** 👈 (this PR — view in Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/142176)
* **#142175** — https://app.graphite.dev/github/pr/llvm/llvm-project/142175
* **#142174** — https://app.graphite.dev/github/pr/llvm/llvm-project/142174
* **#142173** — https://app.graphite.dev/github/pr/llvm/llvm-project/142173
* **#142157** — https://app.graphite.dev/github/pr/llvm/llvm-project/142157
* **#142156** — https://app.graphite.dev/github/pr/llvm/llvm-project/142156
* **#142115** — https://app.graphite.dev/github/pr/llvm/llvm-project/142115
* **#142114** — https://app.graphite.dev/github/pr/llvm/llvm-project/142114
* **#142113** — https://app.graphite.dev/github/pr/llvm/llvm-project/142113
* `main`

This stack of pull requests is managed by Graphite (https://graphite.dev). Learn more about stacking: https://stacking.dev/

https://github.com/llvm/llvm-project/pull/142176
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142175 None >From 196b0107e162236bb902c52ddfba2e732dbc1db2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:46:06 +0200 Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 229 ++-- 3 files changed, 74 insertions(+), 459 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1c30d3f3bd883..ecfa6daf7803d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, - {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, +MVT::v8f16, MVT::v8bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || - VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || - VT == MVT::v32f16); + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || + VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bc1232ce3ed1..ab4cff2469467 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX8-LABEL: s_copysign_v8bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s8, 0x7fff +; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s3 ; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s7 -; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s2 ; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:s_lshr_b32 s3, s6, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s2 -; GFX8-NEXT:v_mov_b32_e32 v3, s3 -; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s1 ; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:s_lshr_b32 s2, s5, 16 -; GFX8-NEXT:s_lshr_b32 s1, s1, 16 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s1 -; GFX8-NEXT:v_mov_b32_e32 v4, s2 -; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s0 ; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:s_lshr_b32 s1, s4, 16 -; GFX8-NEXT:s_lshr_b32 s0, s0, 16 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s0 -; GFX8-NEXT:v_mov_b32_e32 v5, s1 -; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX8-NEXT:v_readfirstlane_b32 s0, v3 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1 @@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX9-LABEL: s_copysign_v8bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s8, 0x7fff +; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s3 ; GFX9-NEXT:v_mov_b32_e32 v1, s7 -; GFX9-NEXT:s_lshr_b32 s7, s7, 16 -; GFX9-NEXT:s_lshr_b32 s3, s3, 16 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX9-NEXT:v_mov_b32_e32 v1, s3
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142177 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Patch is 284.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142174.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+13-11) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+938-1162) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1059-1305) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ab3c316f76deb..1c30d3f3bd883 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -757,7 +757,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, MVT::v2f16, Legal); // Can do this in one BFI plus a constant materialize. -setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom); +setOperationAction(ISD::FCOPYSIGN, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); @@ -5936,10 +5938,11 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || - VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || + VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || + VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || + VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = 
DAG.SplitVectorOperand(Op.getNode(), 1); @@ -7122,18 +7125,17 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op, SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { SDValue Mag = Op.getOperand(0); - SDValue Sign = Op.getOperand(1); - EVT MagVT = Mag.getValueType(); - EVT SignVT = Sign.getValueType(); - assert(MagVT.isVector()); + if (MagVT.getVectorNumElements() > 2) +return splitBinaryVectorOp(Op, DAG); + + SDValue Sign = Op.getOperand(1); + EVT SignVT = Sign.getValueType(); if (MagVT == SignVT) return Op; - assert(MagVT.getVectorNumElements() == 2); - // fcopysign v2f16:mag, v2f32:sign -> // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index a5a36d7122f68..3bc1232ce3ed1 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1090,40 +1090,26 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x ; ; GFX8-LABEL: s_copysign_v3bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s4, 0x7fff +; GFX8-NEXT:s_mov_b32 s4, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s1 ; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:s_lshr_b32 s1, s2, 16 -; GFX8-NEXT:s_lshr_b32 s3, s0, 16 ; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s1 +; GFX8-NEXT:v_mov_b32_e32 v1, s0 +; GFX8-NEXT:v_mov_b32_e32 v2, s2 ; GFX8-NEXT:v_bfi_b32 v1, s4, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s0 -; GFX8-NEXT:v_mov_b32_e32 v3, s2 -; GFX8-NEXT:v_bfi_b32 v2, s4, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_readfirstlane_b32 s0, v1 ; GFX8-NEXT:v_readfirstlane_b32 s1, v0 ; GFX8-NEXT:; return to shader part epilog ; ; GFX9-LABEL: s_copysign_v3bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s4, 0x7fff +; 
GFX9-NEXT:s_mov_b32 s4, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s1 ; GFX9-NEXT:v_mov_b32_e32 v1, s3 ; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT:v_mov_b32_e32 v1, s0 ; GFX9-NEXT:v_mov_b32_e32 v2, s2 -; GFX9-NEXT:s_lshr_b32 s1, s2, 16 -; GFX9-NEXT:s_lshr_b32 s0, s0, 16 ; GFX9-NEXT:v_bfi_b32 v1, s4, v1, v2 -; GFX9-NEXT:v_mov_b32_e32 v2, s0 -; GFX9-NEXT:v_mov_b32_e32 v3, s1 -; GFX9-NEXT:v_bfi_b32 v2, s4, v2, v3 -; GFX9-NEXT:v_and_b32_e32 v1, 0x, v1 -; GFX9-NEXT:v_lshl_or_b32 v1, v2, 16, v1 ; GFX9-NEXT:v_readfirstlane_b32 s0, v1 ; GFX9-NEXT:v_readfirstlane_b32 s1, v0 ; GFX
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142175 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Patch is 56.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142176.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3-3) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+75-490) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+48-383) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index ecfa6daf7803d..3535eb41682d9 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || - VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || - VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index ab4cff2469467..4bbd170529ad0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x bfloat> 
inreg %arg_mag, <16 ; ; GFX8-LABEL: s_copysign_v16bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s16, 0x7fff +; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s7 ; GFX8-NEXT:v_mov_b32_e32 v1, s15 -; GFX8-NEXT:s_lshr_b32 s15, s15, 16 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:v_mov_b32_e32 v2, s15 -; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s6 ; GFX8-NEXT:v_mov_b32_e32 v2, s14 -; GFX8-NEXT:s_lshr_b32 s7, s14, 16 -; GFX8-NEXT:s_lshr_b32 s6, s6, 16 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:v_mov_b32_e32 v3, s7 -; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s5 ; GFX8-NEXT:v_mov_b32_e32 v3, s13 -; GFX8-NEXT:s_lshr_b32 s6, s13, 16 -; GFX8-NEXT:s_lshr_b32 s5, s5, 16 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:v_mov_b32_e32 v4, s6 -; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s4 ; GFX8-NEXT:v_mov_b32_e32 v4, s12 -; GFX8-NEXT:s_lshr_b32 s5, s12, 16 -; GFX8-NEXT:s_lshr_b32 s4, s4, 16 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:v_mov_b32_e32 v5, s5 -; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v4, s3 ; GFX8-NEXT:v_mov_b32_e32 v5, s11 -; GFX8-NEXT:s_lshr_b32 s4, s11, 16 -; GFX8-NEXT:s_lshr_b32 
s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5 -; GFX8-NEXT:v_mov_b32_e32 v5, s3 -; GFX8-NEXT:v_mov_b32_e32 v6, s4 -; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v5, s2 ; GFX8-NEXT:v_mov_b32_e32 v6, s10 -; GFX8-NEXT:s_lshr_b32 s3, s10, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6 -; GFX8-NEXT:v_mov_b32_e32 v6, s2 -; GFX8-NEXT:v_
[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)
arsenm (https://github.com/arsenm) marked this pull request as ready for review. https://github.com/llvm/llvm-project/pull/142176 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
arsenm (https://github.com/arsenm) marked this pull request as ready for review. https://github.com/llvm/llvm-project/pull/142177 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142173 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; 
target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142174 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
arsenm (https://github.com/arsenm) marked this pull request as ready for review. https://github.com/llvm/llvm-project/pull/142174 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
arsenm (https://github.com/arsenm) marked this pull request as ready for review. https://github.com/llvm/llvm-project/pull/142175 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Patch is 32.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142175.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-4) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+41-254) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+28-201) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1c30d3f3bd883..ecfa6daf7803d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, - {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, +MVT::v8f16, MVT::v8bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || - VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || - VT == MVT::v32f16); + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || + VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bc1232ce3ed1..ab4cff2469467 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1390,47 +1390,19 @@ define 
amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX8-LABEL: s_copysign_v8bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s8, 0x7fff +; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s3 ; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s7 -; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s2 ; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:s_lshr_b32 s3, s6, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s2 -; GFX8-NEXT:v_mov_b32_e32 v3, s3 -; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s1 ; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:s_lshr_b32 s2, s5, 16 -; GFX8-NEXT:s_lshr_b32 s1, s1, 16 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s1 -; GFX8-NEXT:v_mov_b32_e32 v4, s2 -; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s0 ; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:s_lshr_b32 s1, s4, 16 -; GFX8-NEXT:s_lshr_b32 s0, s0, 16 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s0 -; GFX8-NEXT:v_mov_b32_e32 v5, s1 -; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_readfirstlane_b32 s0, v3 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2 ; 
GFX8-NEXT:v_readfirstlane_b32 s2, v1 @@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX9-LABEL: s_copysign_v8bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s8, 0x7fff +; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s3 ; GFX9-NEXT:v_mov_b32_e32 v1, s7 -; GFX9-NEXT:s_lshr_b32 s7, s7, 16 -; GFX9-NEXT:s_lshr_b32 s3, s3, 16 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX9-NEXT:v_mov_b32_e32 v1, s3 -; GFX9-NEXT:v_mov_b32_e32 v2, s7 -; GFX9-NEXT:v_bfi_b32 v1, s8, v1, v
[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Patch is 46.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142177.diff 3 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4-2) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+688) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+307) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3535eb41682d9..1957e442dbabb 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, -MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16}, +MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16, +MVT::v32f16, MVT::v32bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 4bbd170529ad0..7c89a41d62fbf 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign ret <16 x bfloat> 
%result } +define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) { +; GCN-LABEL: v_copysign_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT:s_waitcnt vmcnt(1) +; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15 +; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT:v_or_b32_e32 v31, v32, v31 +; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT:v_or_b32_e32 v30, v30, v32 +; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT:v_or_b32_e32 v29, v29, v32 +; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT:v_or_b32_e32 v28, v28, v32 +; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108 +; 
GCN-NEXT:v_or_b32_e32 v27, v27, v32 +; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32 +; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT:v_or_b32_e32 v26, v26, v32 +; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15 +; GCN-NEXT:s_waitcnt vmcnt(0) +; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33 +; GCN-NEXT:
[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)
arsenm wrote: Note GlobalISel currently just expands G_COPYSIGN, and there is a TODO to check that the expansion does the right thing to form a BFI, but it does not. We should probably match the custom lowering / make it directly legal there https://github.com/llvm/llvm-project/pull/142173 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142175 >From 883a508fa80728ad2a916d4a5963b23cf585aaa2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 17:46:06 +0200 Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++-- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 229 ++-- 3 files changed, 74 insertions(+), 459 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1c30d3f3bd883..ecfa6daf7803d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // Can do this in one BFI plus a constant materialize. setOperationAction(ISD::FCOPYSIGN, - {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16}, + {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16, +MVT::v8f16, MVT::v8bf16}, Custom); setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom); @@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, EVT VT = Op.getValueType(); assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || - VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || - VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 || - VT == MVT::v32f16); + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 || + VT == MVT::v32i16 || VT == MVT::v32f16); auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0); auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1); diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 3bc1232ce3ed1..ab4cff2469467 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX8-LABEL: s_copysign_v8bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT:s_movk_i32 s8, 0x7fff +; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX8-NEXT:v_mov_b32_e32 v0, s3 ; GFX8-NEXT:v_mov_b32_e32 v1, s7 -; GFX8-NEXT:s_lshr_b32 s7, s7, 16 -; GFX8-NEXT:s_lshr_b32 s3, s3, 16 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX8-NEXT:v_mov_b32_e32 v1, s3 -; GFX8-NEXT:v_mov_b32_e32 v2, s7 -; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v1, s2 ; GFX8-NEXT:v_mov_b32_e32 v2, s6 -; GFX8-NEXT:s_lshr_b32 s3, s6, 16 -; GFX8-NEXT:s_lshr_b32 s2, s2, 16 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2 -; GFX8-NEXT:v_mov_b32_e32 v2, s2 -; GFX8-NEXT:v_mov_b32_e32 v3, s3 -; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v2, s1 ; GFX8-NEXT:v_mov_b32_e32 v3, s5 -; GFX8-NEXT:s_lshr_b32 s2, s5, 16 -; GFX8-NEXT:s_lshr_b32 s1, s1, 16 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3 -; GFX8-NEXT:v_mov_b32_e32 v3, s1 -; GFX8-NEXT:v_mov_b32_e32 v4, s2 -; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT:v_mov_b32_e32 v3, s0 ; GFX8-NEXT:v_mov_b32_e32 v4, s4 -; GFX8-NEXT:s_lshr_b32 s1, s4, 16 -; GFX8-NEXT:s_lshr_b32 s0, s0, 16 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4 -; GFX8-NEXT:v_mov_b32_e32 v4, s0 -; GFX8-NEXT:v_mov_b32_e32 v5, s1 -; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5 -; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; 
GFX8-NEXT:v_readfirstlane_b32 s0, v3 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1 @@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x bfloat> inreg %arg_mag, <8 x ; ; GFX9-LABEL: s_copysign_v8bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT:s_movk_i32 s8, 0x7fff +; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff ; GFX9-NEXT:v_mov_b32_e32 v0, s3 ; GFX9-NEXT:v_mov_b32_e32 v1, s7 -; GFX9-NEXT:s_lshr_b32 s7, s7, 16 -; GFX9-NEXT:s_lshr_b32 s3, s3, 16 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1 -; GFX9-NEXT:v_mov_b32_e32 v1, s3 -; GF
[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)
https://github.com/rampitec approved this pull request. https://github.com/llvm/llvm-project/pull/142115 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [OpenMP] Add directive spellings introduced in spec v6.0 (PR #141772)
https://github.com/mjklemm approved this pull request. LGTM https://github.com/llvm/llvm-project/pull/141772 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] [KeyInstr][Clang] Coerced store atoms (PR #134653)
https://github.com/SLTozer approved this pull request. https://github.com/llvm/llvm-project/pull/134653 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [UBSan][Ignorelist] Expanding =sanitize to global. (PR #142077)
https://github.com/qinkunbao closed https://github.com/llvm/llvm-project/pull/142077 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [clang] b9503fe - Revert "Add IR Profile-Guided Optimization (IR PGO) support to the Flang comp…"
Author: Tarun Prabhu Date: 2025-05-30T08:22:15-06:00 New Revision: b9503fe262c416111ee77be30767a791cf750fb8 URL: https://github.com/llvm/llvm-project/commit/b9503fe262c416111ee77be30767a791cf750fb8 DIFF: https://github.com/llvm/llvm-project/commit/b9503fe262c416111ee77be30767a791cf750fb8.diff LOG: Revert "Add IR Profile-Guided Optimization (IR PGO) support to the Flang comp…" This reverts commit d27a210a77af63568db9f829702b4b2c98473a46. Added: Modified: clang/include/clang/Basic/CodeGenOptions.def clang/include/clang/Basic/CodeGenOptions.h clang/include/clang/Basic/ProfileList.h clang/include/clang/Driver/Options.td clang/lib/Basic/ProfileList.cpp clang/lib/CodeGen/BackendUtil.cpp clang/lib/CodeGen/CodeGenAction.cpp clang/lib/CodeGen/CodeGenFunction.cpp clang/lib/CodeGen/CodeGenModule.cpp clang/lib/Driver/ToolChains/Flang.cpp clang/lib/Frontend/CompilerInvocation.cpp flang/include/flang/Frontend/CodeGenOptions.def flang/include/flang/Frontend/CodeGenOptions.h flang/lib/Frontend/CompilerInvocation.cpp flang/lib/Frontend/FrontendActions.cpp flang/test/Driver/flang-f-opts.f90 llvm/include/llvm/Frontend/Driver/CodeGenOptions.h llvm/lib/Frontend/Driver/CodeGenOptions.cpp Removed: flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext flang/test/Profile/gcc-flag-compatibility.f90 diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 11dad53a52efe..aad4e107cbeb3 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -223,11 +223,9 @@ AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation profiling /// Choose profile instrumenation kind or no instrumentation. 
- -ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, llvm::driver::ProfileInstrKind::ProfileNone) - +ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileNone) /// Choose profile kind for PGO use compilation. -ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone) +ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone) /// Partition functions into N groups and select only functions in group i to be /// instrumented. Selected group numbers can be 0 to N-1 inclusive. VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index bffbd00b1bd72..278803f7bb960 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -518,41 +518,35 @@ class CodeGenOptions : public CodeGenOptionsBase { /// Check if Clang profile instrumenation is on. bool hasProfileClangInstr() const { -return getProfileInstr() == - llvm::driver::ProfileInstrKind::ProfileClangInstr; +return getProfileInstr() == ProfileClangInstr; } /// Check if IR level profile instrumentation is on. bool hasProfileIRInstr() const { -return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr; +return getProfileInstr() == ProfileIRInstr; } /// Check if CS IR level profile instrumentation is on. bool hasProfileCSIRInstr() const { -return getProfileInstr() == - llvm::driver::ProfileInstrKind::ProfileCSIRInstr; +return getProfileInstr() == ProfileCSIRInstr; } /// Check if any form of instrumentation is on. - bool hasProfileInstr() const { -return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone; - } + bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; } /// Check if Clang profile use is on. 
bool hasProfileClangUse() const { -return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileClangInstr; +return getProfileUse() == ProfileClangInstr; } /// Check if IR level profile use is on. bool hasProfileIRUse() const { -return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr || - getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr; +return getProfileUse() == ProfileIRInstr || + getProfileUse() == ProfileCSIRInstr; } /// Check if CSIR profile use is on. - bool hasProfileCSIRUse() const { -return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr; - } + bool hasProfileCSIRUse() const { return getProfileUse() == ProfileCSIRInstr; } /// Check if type and variable info should be emitted. bool hasReducedDebugInfo() const { diff --git a/clang/include/clang/Basic/ProfileList.h b/c
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142157 This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. >From ad2fdd8df6f80fb7c3792b33012b0ecba28d656b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:15:33 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine This avoids some ugly codegen on pre-16-bit instruction targets now from annoying f16 legalization effects. This also avoids regressions on newer targets in a future patch. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +++- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 64 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 186 ++--- 3 files changed, 65 insertions(+), 220 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index af85c6bef273d..c61c52ec5843e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) EVT MagVT = MagnitudeOp.getValueType(); - if (MagVT.getScalarType() == MVT::f64) { -unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + + if (MagVT.getScalarType() == MVT::f64) { EVT F32VT = MagVT.isVector() ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) : MVT::v2f32; @@ -11777,7 +11778,7 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignVT != MVT::f64) + if (SignVT.getScalarType() != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
@@ -11785,13 +11786,31 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, // fcopysign f64:x, f64:y -> // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1) // TODO: In some cases it might make sense to go all the way to f16. - SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp); - SDValue SignAsF32 = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, - DAG.getConstant(1, DL, MVT::i32)); + + EVT F32VT = MagVT.isVector() + ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) + : MVT::v2f32; + + SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp); + + SmallVector F32Signs; + for (unsigned I = 0; I != NumElts; ++I) { +// Take sign from odd elements of cast vector +SDValue SignAsF32 = +DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector, +DAG.getConstant(2 * I + 1, DL, MVT::i32)); +F32Signs.push_back(SignAsF32); + } + + SDValue NewSign = + NumElts == 1 + ? F32Signs.back() + : DAG.getNode(ISD::BUILD_VECTOR, DL, +EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts), +F32Signs); return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), - SignAsF32); + NewSign); } // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 90a368885bfdc..45bf0770ad924 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3] -; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5] +; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5 +; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v1, v1, 
16, 15 +; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15 -; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3 -; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT:v_or_b32_e32 v1, v1, v3 -; GCN-NEXT:v_or_b32_e32 v0, v0, v2 -; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT:v_or_b32_e32 v1, v1, v2 +; GCN-NEXT:v_or_b32_e32 v0, v0, v3 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT:s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgk
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/142156 None >From 41692703e0fea3a91ffcb910eb56b5921f2b9ed1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 30 May 2025 12:03:35 +0200 Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 --- .../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +- llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 99 --- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 65 ++-- 4 files changed, 117 insertions(+), 107 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. 
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index e99a6bf273e3b..90a368885bfdc 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/Cod
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142156 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
arsenm wrote: > [!WARNING] > This pull request is not mergeable via GitHub because a downstack PR is > open. Once all requirements are satisfied, merge this PR as a stack href="https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-downstack-mergeability-warning"; > >on Graphite. > https://graphite.dev/docs/merge-pull-requests";>Learn more * **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-view-in-graphite"; target="_blank">(View in Graphite) * **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon"; target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" width="10px" height="10px"/> * `main` This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn more about https://stacking.dev/?utm_source=stack-comment";>stacking. https://github.com/llvm/llvm-project/pull/142157 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142156 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes --- Full diff: https://github.com/llvm/llvm-project/pull/142156.diff 4 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-12) - (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+40-59) - (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+30-35) ``diff diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 74ca3e43fce3a..af85c6bef273d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue MagnitudeOp = N->getOperand(0); SDValue SignOp = N->getOperand(1); + + // The generic combine for fcopysign + fp cast is too conservative with + // vectors, and also gets confused by the splitting we will perform here, so + // peek through FP casts. + if (SignOp.getOpcode() == ISD::FP_EXTEND || + SignOp.getOpcode() == ISD::FP_ROUND) +SignOp = SignOp.getOperand(0); + SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT SignVT = SignOp.getValueType(); // f64 fcopysign is really an f32 copysign on the high bits, so replace the // lower half with a copy. // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) - if (MagnitudeOp.getValueType() == MVT::f64) { -SDValue MagAsVector = -DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp); -SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(0, DL, MVT::i32)); -SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, -MagAsVector, DAG.getConstant(1, DL, MVT::i32)); + EVT MagVT = MagnitudeOp.getValueType(); + if (MagVT.getScalarType() == MVT::f64) { +unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1; + +EVT F32VT = MagVT.isVector() +? 
EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts) +: MVT::v2f32; + +SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp); + +SmallVector NewElts; +for (unsigned I = 0; I != NumElts; ++I) { + SDValue MagLo = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I, DL, MVT::i32)); + SDValue MagHi = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector, + DAG.getConstant(2 * I + 1, DL, MVT::i32)); -SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp); + SDValue SignOpElt = + MagVT.isVector() + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(), +SignOp, DAG.getConstant(I, DL, MVT::i32)) + : SignOp; + + SDValue HiOp = + DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt); + + SDValue Vector = + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); + + SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); + NewElts.push_back(NewElt); +} -SDValue Vector = -DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp); +if (NewElts.size() == 1) + return NewElts[0]; -return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector); +return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts); } - if (SignOp.getValueType() != MVT::f64) + if (SignVT != MVT::f64) return SDValue(); // Reduce width of sign operand, we only need the highest bit. 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index a01c2fa152ab3..15b049d4d7563 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4 +; GFX9-NEXT:v_or_b32_e32 v6, 1, v5 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT:s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index e99a6bf273e3b..90a368885bfdc 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -4055,50 +4055,38 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x do
[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/142157 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)
@@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return false; } +static bool parseRootDescriptors(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *RootDescriptorNode) { + + if (RootDescriptorNode->getNumOperands() != 5) +return reportError(Ctx, "Invalid format for RootConstants Element"); + + std::optional ElementText = + extractMdStringValue(RootDescriptorNode, 0); + assert(!ElementText->empty()); + + dxbc::RootParameterHeader Header; + Header.ParameterType = + StringSwitch(*ElementText) + .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV)) + .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV)) + .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV)); joaosaffran wrote: I did some research, it seems that it would be undefined behavior, @bogner correct me if I am wrong please. Will update to handle it better https://github.com/llvm/llvm-project/pull/139781 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)
arsenm wrote: ### Merge activity * **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142114). https://github.com/llvm/llvm-project/pull/142114 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)
arsenm wrote: ### Merge activity * **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull request via [Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142115). https://github.com/llvm/llvm-project/pull/142115 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
https://github.com/changpeng approved this pull request. https://github.com/llvm/llvm-project/pull/142174 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)
@@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return false; } +static bool parseRootDescriptors(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *RootDescriptorNode) { + + if (RootDescriptorNode->getNumOperands() != 5) +return reportError(Ctx, "Invalid format for RootConstants Element"); + + std::optional ElementText = + extractMdStringValue(RootDescriptorNode, 0); + assert(!ElementText->empty()); + + dxbc::RootParameterHeader Header; + Header.ParameterType = + StringSwitch(*ElementText) + .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV)) + .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV)) + .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV)); joaosaffran wrote: Took a look into this, I don't think a default scenario is needed here, this is previously checked and error handled when this method is called. https://github.com/llvm/llvm-project/pull/139781 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)
https://github.com/joaosaffran updated https://github.com/llvm/llvm-project/pull/139781 >From f871e2049418d6f09bedaa685e72d3a76f15 Mon Sep 17 00:00:00 2001 From: joaosaffran Date: Tue, 13 May 2025 02:07:31 + Subject: [PATCH 1/5] adding support for root descriptors --- llvm/lib/Target/DirectX/DXILRootSignature.cpp | 64 ++- llvm/lib/Target/DirectX/DXILRootSignature.h | 3 +- .../RootSignature-RootDescriptor.ll | 34 ++ 3 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 1bd816b026fec..77bdb2c2f588f 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -55,6 +55,14 @@ static std::optional extractMdIntValue(MDNode *Node, return std::nullopt; } +static std::optional extractMdStringValue(MDNode *Node, + unsigned int OpId) { + MDString *NodeText = cast(Node->getOperand(OpId)); + if (NodeText == nullptr) +return std::nullopt; + return NodeText->getString(); +} + static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, MDNode *RootFlagNode) { @@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, return false; } +static bool parseRootDescriptors(LLVMContext *Ctx, + mcdxbc::RootSignatureDesc &RSD, + MDNode *RootDescriptorNode) { + + if (RootDescriptorNode->getNumOperands() != 5) +return reportError(Ctx, "Invalid format for RootConstants Element"); + + std::optional ElementText = + extractMdStringValue(RootDescriptorNode, 0); + assert(!ElementText->empty()); + + dxbc::RootParameterHeader Header; + Header.ParameterType = + StringSwitch(*ElementText) + .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV)) + .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV)) + .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV)); 
+ + if (std::optional Val = extractMdIntValue(RootDescriptorNode, 1)) +Header.ShaderVisibility = *Val; + else +return reportError(Ctx, "Invalid value for ShaderVisibility"); + + dxbc::RTS0::v1::RootDescriptor Descriptor; + if (std::optional Val = extractMdIntValue(RootDescriptorNode, 2)) +Descriptor.ShaderRegister = *Val; + else +return reportError(Ctx, "Invalid value for ShaderRegister"); + + if (std::optional Val = extractMdIntValue(RootDescriptorNode, 3)) +Descriptor.RegisterSpace = *Val; + else +return reportError(Ctx, "Invalid value for RegisterSpace"); + + if (RSD.Version == 1) { +RSD.ParametersContainer.addParameter(Header, Descriptor); +return false; + } + assert(RSD.Version > 1); + dxbc::RTS0::v2::RootDescriptor DescriptorV2(Descriptor); + + if (std::optional Val = extractMdIntValue(RootDescriptorNode, 4)) +DescriptorV2.Flags = *Val; + else +return reportError(Ctx, "Invalid value for Root Descriptor Flags"); + + RSD.ParametersContainer.addParameter(Header, DescriptorV2); + return false; +} + static bool parseRootSignatureElement(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD, MDNode *Element) { @@ -116,6 +174,9 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, StringSwitch(ElementText->getString()) .Case("RootFlags", RootSignatureElementKind::RootFlags) .Case("RootConstants", RootSignatureElementKind::RootConstants) + .Case("RootCBV", RootSignatureElementKind::RootDescriptors) + .Case("RootSRV", RootSignatureElementKind::RootDescriptors) + .Case("RootUAV", RootSignatureElementKind::RootDescriptors) .Default(RootSignatureElementKind::Error); switch (ElementKind) { @@ -124,7 +185,8 @@ static bool parseRootSignatureElement(LLVMContext *Ctx, return parseRootFlags(Ctx, RSD, Element); case RootSignatureElementKind::RootConstants: return parseRootConstants(Ctx, RSD, Element); -break; + case RootSignatureElementKind::RootDescriptors: +return parseRootDescriptors(Ctx, RSD, Element); case RootSignatureElementKind::Error: return reportError(Ctx, "Invalid 
Root Signature Element: " + ElementText->getString()); diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h b/llvm/lib/Target/DirectX/DXILRootSignature.h index 93ec614f1ab85..b8742d1b1fdfd 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.h +++ b/llvm/lib/Target/DirectX/DXILRootSignature.h @@ -27,7 +27,8 @@ namespace dxil { enum class RootSignatureElementKind { Error = 0, RootFlags = 1, - RootConstants = 2 + RootConstants = 2, + RootDes