[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 31, 5:58 AM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142177).


https://github.com/llvm/llvm-project/pull/142177
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 31, 5:58 AM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142175).


https://github.com/llvm/llvm-project/pull/142175
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] Backport: [clang] Serialization: support hashing null template arguments (PR #141957)

2025-05-30 Thread Matheus Izvekov via llvm-branch-commits

https://github.com/mizvekov edited 
https://github.com/llvm/llvm-project/pull/141957
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Make symmetric with other copysign tests

---

Patch is 35.67 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142114.diff


1 Files Affected:

- (added) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+959) 


```diff
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
new file mode 100644
index 0..4fcce8a6d623f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -0,0 +1,959 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s 
-check-prefixes=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s 
-check-prefixes=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s 
-check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11FAKE16
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_bf16:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT:s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+; GCN-LABEL: v_copysign_bf16_s_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:s_and_b32 s4, s16, 0x8000
+; GCN-NEXT:s_lshr_b32 s4, s4, 16
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, s4, v0
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_s_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:s_and_b32 s4, s16, 0x8000
+; GFX7-NEXT:s_lshr_b32 s4, s4, 16
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, s4, v0
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_s_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_mov_b32_e32 v1, s16
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_s_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_mov_b32_e32 v1, s16
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_s_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, s16
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_b

[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142115
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 365.90 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142115.diff


2 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+4746-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+3652) 


```diff
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4fcce8a6d623f..e99a6bf273e3b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -954,6 +954,4749 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double 
inreg %mag, bfloat inreg
   %ins.1 = insertelement <2 x i32> %ins.0, i32 %readlane1, i32 1
   ret <2 x i32> %ins.1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add 
tests below this line:
-; GFX11FAKE16: {{.*}}
-; GFX11TRUE16: {{.*}}
+
+define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x 
bfloat> inreg %arg_sign) {
+; GCN-LABEL: s_copysign_v2bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:v_mul_f32_e64 v0, 1.0, s3
+; GCN-NEXT:v_mul_f32_e64 v1, 1.0, s2
+; GCN-NEXT:v_mul_f32_e64 v2, 1.0, s1
+; GCN-NEXT:v_mul_f32_e64 v3, 1.0, s0
+; GCN-NEXT:v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_bfe_u32 v3, v3, 16, 15
+; GCN-NEXT:v_bfe_u32 v2, v2, 16, 15
+; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:v_and_b32_e32 v0, 0x8000, v0
+; GCN-NEXT:v_or_b32_e32 v1, v3, v1
+; GCN-NEXT:v_or_b32_e32 v0, v2, v0
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v0, v1, v0
+; GCN-NEXT:v_readfirstlane_b32 s0, v0
+; GCN-NEXT:; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_v2bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:v_mul_f32_e64 v0, 1.0, s3
+; GFX7-NEXT:v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:v_mul_f32_e64 v1, 1.0, s2
+; GFX7-NEXT:v_mul_f32_e64 v2, 1.0, s1
+; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:v_mul_f32_e64 v3, 1.0, s0
+; GFX7-NEXT:v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT:v_bfe_u32 v2, v2, 16, 15
+; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:v_bfe_u32 v3, v3, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT:v_or_b32_e32 v1, v3, v1
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:v_or_b32_e32 v0, v1, v0
+; GFX7-NEXT:v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_v2bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_movk_i32 s2, 0x7fff
+; GFX8-NEXT:v_mov_b32_e32 v0, s0
+; GFX8-NEXT:v_mov_b32_e32 v1, s1
+; GFX8-NEXT:s_lshr_b32 s1, s1, 16
+; GFX8-NEXT:s_lshr_b32 s0, s0, 16
+; GFX8-NEXT:v_bfi_b32 v0, s2, v0, v1
+; GFX8-NEXT:v_mov_b32_e32 v1, s0
+; GFX8-NEXT:v_mov_b32_e32 v2, s1
+; GFX8-NEXT:v_bfi_b32 v1, s2, v1, v2
+; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:v_readfirstlane_b32 s0, v0
+; GFX8-NEXT:; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_movk_i32 s2, 0x7fff
+; GFX9-NEXT:v_mov_b32_e32 v0, s0
+; GFX9-NEXT:v_mov_b32_e32 v1, s1
+; GFX9-NEXT:s_lshr_b32 s1, s1, 16
+; GFX9-NEXT:s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:v_bfi_b32 v0, s2, v0, v1
+; GFX9-NEXT:v_mov_b32_e32 v1, s0
+; GFX9-NEXT:v_mov_b32_e32 v2, s1
+; GFX9-NEXT:v_bfi_b32 v1, s2, v1, v2
+; GFX9-NEXT:v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:v_lshl_or_b32 v0, v1, 16, v0
+; GFX9-NEXT:v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_v2bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:v_mov_b32_e32 v0, s1
+; GFX10-NEXT:s_lshr_b32 s1, s1, 16
+; GFX10-NEXT:v_mov_b32_e32 v1, s1
+; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX10-NEXT:s_lshr_b32 s0, s0, 16
+; GFX10-NEXT:v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX10-NEXT:v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2bf16:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:v_mov_b32_e32 v0, s1
+; GFX11-NEXT:s_lshr_b32 s1, s1, 16
+; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(VALU_DEP_2)
+; GFX11-NEXT:v_mov_b32_e32 v1, s1
+; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-NEXT:s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1)
+; GFX11-NEXT:v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:v_lshl_or

[llvm-branch-commits] [mlir] [mlir] Unique property constraints where possible (PR #140849)

2025-05-30 Thread Mehdi Amini via llvm-branch-commits

https://github.com/joker-eph approved this pull request.

Nice!

https://github.com/llvm/llvm-project/pull/140849
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.

---
Full diff: https://github.com/llvm/llvm-project/pull/142157.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+27-8) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+28-36) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+10-176) 


```diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector<SDValue> F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 90a368885bfdc..45bf0770ad924 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
-; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:v_and_b32_e32 v2, 0x8000, v5
 ; GFX7-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT:v_l

[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142114
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142156

>From 9427bd08206493b681edacd5e54da977bee8fd86 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  58 +++-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +--
 4 files changed, 242 insertions(+), 291 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector<SDValue> NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142157

>From ff07bad7e0442c2b4deabadda4d5242e9b190451 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:15:33 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  35 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 501 ++---
 3 files changed, 129 insertions(+), 581 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector<SDValue> F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bd068362410b..26ea80a802f91 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
-; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:v_mul_f32_e32 v

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142157

>From ff07bad7e0442c2b4deabadda4d5242e9b190451 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:15:33 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  35 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 501 ++---
 3 files changed, 129 insertions(+), 581 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector<SDValue> F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bd068362410b..26ea80a802f91 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
-; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:v_mul_f32_e32 v

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142156

>From 9427bd08206493b681edacd5e54da977bee8fd86 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  58 +++-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +--
 4 files changed, 242 insertions(+), 291 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU

[llvm-branch-commits] [clang] Backport: [clang] Serialization: support hashing null template arguments (PR #141957)

2025-05-30 Thread Nikita Popov via llvm-branch-commits

https://github.com/nikic milestoned 
https://github.com/llvm/llvm-project/pull/141957
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142115
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142114

Make symmetric with other copysign tests

>From da7b0574d489d67f6f05dd396e4a8bdf95941bf8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 11:21:42 +0200
Subject: [PATCH] AMDGPU: Move bf16 copysign tests to separate file

Make symmetric with other copysign tests
---
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 959 +
 1 file changed, 959 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
new file mode 100644
index 0..4fcce8a6d623f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -0,0 +1,959 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s 
-check-prefixes=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s 
-check-prefixes=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s 
-check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11FAKE16
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_bf16:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT:s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+; GCN-LABEL: v_copysign_bf16_s_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:s_and_b32 s4, s16, 0x8000
+; GCN-NEXT:s_lshr_b32 s4, s4, 16
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, s4, v0
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_s_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:s_and_b32 s4, s16, 0x8000
+; GFX7-NEXT:s_lshr_b32 s4, s4, 16
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, s4, v0
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_s_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_mov_b32_e32 v1, s16
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_s_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_mov_b32_e32 v1, s16
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_s_bf16:
+; GFX10:   ; %bb.

[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142114
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] f981d2e - Revert "[Doc][NFC] Fix a typo in SanitizerSpecialCaseList doc. (#142168)"

2025-05-30 Thread via llvm-branch-commits

Author: Qinkun Bao
Date: 2025-05-30T12:10:48-04:00
New Revision: f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7

URL: 
https://github.com/llvm/llvm-project/commit/f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7
DIFF: 
https://github.com/llvm/llvm-project/commit/f981d2ede98876d7b5bc8b3ee12a35fa4d99dcf7.diff

LOG: Revert "[Doc][NFC] Fix a typo in SanitizerSpecialCaseList doc. (#142168)"

This reverts commit 6a47241c9983c46d805034821f04c34a475a254f.

Added: 


Modified: 
clang/docs/SanitizerSpecialCaseList.rst

Removed: 




diff  --git a/clang/docs/SanitizerSpecialCaseList.rst 
b/clang/docs/SanitizerSpecialCaseList.rst
index 6f924cfa97a97..b82db78a9203c 100644
--- a/clang/docs/SanitizerSpecialCaseList.rst
+++ b/clang/docs/SanitizerSpecialCaseList.rst
@@ -102,7 +102,7 @@ supported sanitizers.
 char c = toobig; // also not instrumented
   }
 
-If multiple entries match the source, then the latest entry takes the
+If multiple entries match the source, than the latest entry takes the
 precedence.
 
 .. code-block:: bash



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142173
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142175

>From d94e349591ac69d46c5061b2af7722a49bbb5902 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:46:06 +0200
Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   9 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 229 ++--
 3 files changed, 74 insertions(+), 459 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c30d3f3bd883..ecfa6daf7803d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
-   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
+MVT::v8f16, MVT::v8bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
- VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
- VT == MVT::v32f16);
+ VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
+ VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bc1232ce3ed1..ab4cff2469467 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX8-LABEL: s_copysign_v8bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s7
-; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:s_lshr_b32 s3, s6, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s2
-; GFX8-NEXT:v_mov_b32_e32 v3, s3
-; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:s_lshr_b32 s2, s5, 16
-; GFX8-NEXT:s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s1
-; GFX8-NEXT:v_mov_b32_e32 v4, s2
-; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:s_lshr_b32 s1, s4, 16
-; GFX8-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s0
-; GFX8-NEXT:v_mov_b32_e32 v5, s1
-; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v3
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2
 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX9-LABEL: s_copysign_v8bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s8, 0x7fff
+; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:v_mov_b32_e32 v1, s7
-; GFX9-NEXT:s_lshr_b32 s7, s7, 16
-; GFX9-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX9-NEXT:v_mov_b32_e32 v1, s3
-; GF

[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142176

>From bae344390de7b6e851ed40f356091d3b5f72b48e Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:48:01 +0200
Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 431 ++--
 3 files changed, 126 insertions(+), 876 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecfa6daf7803d..3535eb41682d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
- VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index ab4cff2469467..4bbd170529ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x 
bfloat> inreg %arg_mag, <16
 ;
 ; GFX8-LABEL: s_copysign_v16bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:v_mov_b32_e32 v1, s15
-; GFX8-NEXT:s_lshr_b32 s15, s15, 16
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:v_mov_b32_e32 v2, s15
-; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s6
 ; GFX8-NEXT:v_mov_b32_e32 v2, s14
-; GFX8-NEXT:s_lshr_b32 s7, s14, 16
-; GFX8-NEXT:s_lshr_b32 s6, s6, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:v_mov_b32_e32 v3, s7
-; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:v_mov_b32_e32 v3, s13
-; GFX8-NEXT:s_lshr_b32 s6, s13, 16
-; GFX8-NEXT:s_lshr_b32 s5, s5, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:v_mov_b32_e32 v4, s6
-; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:v_mov_b32_e32 v4, s12
-; GFX8-NEXT:s_lshr_b32 s5, s12, 16
-; GFX8-NEXT:s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:v_mov_b32_e32 v5, s5
-; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:v_mov_b32_e32 v5, s11
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_mov_b32_e32 v5, s3
-; GFX8-NEXT:v_mov_b32_e32 v6, s4
-; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:v_mov_b32_e32 v6, s10
-; GFX8-NEXT:s_lshr_b32 s3, s10, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b

[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142177

>From 039aac3892486d499dec0c72995bf0a75e86a409 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:53:15 +0200
Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 307 +
 3 files changed, 999 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+MVT::v32f16, MVT::v32bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
  VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> 
%mag, <16 x bfloat> %sign
   ret <16 x bfloat> %result
 }
 
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> 
%sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:s_waitcnt vmcnt(1)
+; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:v_or_b32_e32 v31, v32, v31
+; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:v_or_b32_e32 v30, v30, v32
+; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:v_or_b32_e32 v29, v29, v32
+; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:v_or_b32_e32 v28, v28, v32
+; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:v_or_b32_e32 v27, v27, v32
+; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:v_or_b32_e32 v26, v26, v32
+; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT:s_w

[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142176

>From 93748937ce90b591ef40e2d75e96c7f1904758f4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:48:01 +0200
Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 431 ++--
 3 files changed, 126 insertions(+), 876 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecfa6daf7803d..3535eb41682d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
- VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index ab4cff2469467..4bbd170529ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x 
bfloat> inreg %arg_mag, <16
 ;
 ; GFX8-LABEL: s_copysign_v16bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:v_mov_b32_e32 v1, s15
-; GFX8-NEXT:s_lshr_b32 s15, s15, 16
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:v_mov_b32_e32 v2, s15
-; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s6
 ; GFX8-NEXT:v_mov_b32_e32 v2, s14
-; GFX8-NEXT:s_lshr_b32 s7, s14, 16
-; GFX8-NEXT:s_lshr_b32 s6, s6, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:v_mov_b32_e32 v3, s7
-; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:v_mov_b32_e32 v3, s13
-; GFX8-NEXT:s_lshr_b32 s6, s13, 16
-; GFX8-NEXT:s_lshr_b32 s5, s5, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:v_mov_b32_e32 v4, s6
-; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:v_mov_b32_e32 v4, s12
-; GFX8-NEXT:s_lshr_b32 s5, s12, 16
-; GFX8-NEXT:s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:v_mov_b32_e32 v5, s5
-; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:v_mov_b32_e32 v5, s11
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_mov_b32_e32 v5, s3
-; GFX8-NEXT:v_mov_b32_e32 v6, s4
-; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:v_mov_b32_e32 v6, s10
-; GFX8-NEXT:s_lshr_b32 s3, s10, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142156

>From 158179f7aba2fcdc96091da39f33ad99fd040af6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  58 +++-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +--
 4 files changed, 242 insertions(+), 291 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142157

>From ed0712298fd1c3a625ad870d54c5bf3c21052712 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:15:33 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  35 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 501 ++---
 3 files changed, 129 insertions(+), 581 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bd068362410b..26ea80a802f91 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
-; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:v_mul_f32_e32 v

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142156

>From 158179f7aba2fcdc96091da39f33ad99fd040af6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |  58 +++-
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 294 +++---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +--
 4 files changed, 242 insertions(+), 291 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU

[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142177

>From f6e957bcc7f122fb35e0ecc7dfa82fec56b2a865 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:53:15 +0200
Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 307 +
 3 files changed, 999 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+MVT::v32f16, MVT::v32bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
  VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> 
%mag, <16 x bfloat> %sign
   ret <16 x bfloat> %result
 }
 
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> 
%sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:s_waitcnt vmcnt(1)
+; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:v_or_b32_e32 v31, v32, v31
+; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:v_or_b32_e32 v30, v30, v32
+; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:v_or_b32_e32 v29, v29, v32
+; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:v_or_b32_e32 v28, v28, v32
+; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:v_or_b32_e32 v27, v27, v32
+; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:v_or_b32_e32 v26, v26, v32
+; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT:s_w

[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142177

None

>From e55b837d5e54d23a16219ca133838fdcab3b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:53:15 +0200
Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 307 +
 3 files changed, 999 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+MVT::v32f16, MVT::v32bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
  VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> 
%mag, <16 x bfloat> %sign
   ret <16 x bfloat> %result
 }
 
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> 
%sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:s_waitcnt vmcnt(1)
+; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:v_or_b32_e32 v31, v32, v31
+; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:v_or_b32_e32 v30, v30, v32
+; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:v_or_b32_e32 v29, v29, v32
+; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:v_or_b32_e32 v28, v28, v32
+; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:v_or_b32_e32 v27, v27, v32
+; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:v_or_b32_e32 v26, v26, v32
+; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT: 

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142156
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142157
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142174
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142175
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142177
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142157).


https://github.com/llvm/llvm-project/pull/142157
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.

LGTM with a nit: title says it is legal, but it is custom.

https://github.com/llvm/llvm-project/pull/142173
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142114
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142156).


https://github.com/llvm/llvm-project/pull/142156
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142176
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> LGTM with a nit: title says it is legal, but it is custom.

The same type size is still treated as legal 

https://github.com/llvm/llvm-project/pull/142173
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Shilei Tian via llvm-branch-commits

shiltian wrote:

Is it a "move" or adds new tests? There doesn't seem to be any delete.

https://github.com/llvm/llvm-project/pull/142114
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142177

>From f6e957bcc7f122fb35e0ecc7dfa82fec56b2a865 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:53:15 +0200
Subject: [PATCH] AMDGPU: Improve v32f16/v32bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 688 +
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 307 +
 3 files changed, 999 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+MVT::v32f16, MVT::v32bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
  VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> 
%mag, <16 x bfloat> %sign
   ret <16 x bfloat> %result
 }
 
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> 
%sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:s_waitcnt vmcnt(1)
+; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:v_or_b32_e32 v31, v32, v31
+; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:v_or_b32_e32 v30, v30, v32
+; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:v_or_b32_e32 v29, v29, v32
+; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:v_or_b32_e32 v28, v28, v32
+; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:v_or_b32_e32 v27, v27, v32
+; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:v_or_b32_e32 v26, v26, v32
+; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT:s_w

[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142176

>From 93748937ce90b591ef40e2d75e96c7f1904758f4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:48:01 +0200
Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 431 ++--
 3 files changed, 126 insertions(+), 876 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecfa6daf7803d..3535eb41682d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
- VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index ab4cff2469467..4bbd170529ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x 
bfloat> inreg %arg_mag, <16
 ;
 ; GFX8-LABEL: s_copysign_v16bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:v_mov_b32_e32 v1, s15
-; GFX8-NEXT:s_lshr_b32 s15, s15, 16
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:v_mov_b32_e32 v2, s15
-; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s6
 ; GFX8-NEXT:v_mov_b32_e32 v2, s14
-; GFX8-NEXT:s_lshr_b32 s7, s14, 16
-; GFX8-NEXT:s_lshr_b32 s6, s6, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:v_mov_b32_e32 v3, s7
-; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:v_mov_b32_e32 v3, s13
-; GFX8-NEXT:s_lshr_b32 s6, s13, 16
-; GFX8-NEXT:s_lshr_b32 s5, s5, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:v_mov_b32_e32 v4, s6
-; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:v_mov_b32_e32 v4, s12
-; GFX8-NEXT:s_lshr_b32 s5, s12, 16
-; GFX8-NEXT:s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:v_mov_b32_e32 v5, s5
-; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:v_mov_b32_e32 v5, s11
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_mov_b32_e32 v5, s3
-; GFX8-NEXT:v_mov_b32_e32 v6, s4
-; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:v_mov_b32_e32 v6, s10
-; GFX8-NEXT:s_lshr_b32 s3, s10, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b

[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142175

>From 883a508fa80728ad2a916d4a5963b23cf585aaa2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:46:06 +0200
Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   9 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 229 ++--
 3 files changed, 74 insertions(+), 459 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c30d3f3bd883..ecfa6daf7803d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
-   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
+MVT::v8f16, MVT::v8bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
- VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
- VT == MVT::v32f16);
+ VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
+ VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bc1232ce3ed1..ab4cff2469467 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX8-LABEL: s_copysign_v8bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s7
-; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:s_lshr_b32 s3, s6, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s2
-; GFX8-NEXT:v_mov_b32_e32 v3, s3
-; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:s_lshr_b32 s2, s5, 16
-; GFX8-NEXT:s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s1
-; GFX8-NEXT:v_mov_b32_e32 v4, s2
-; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:s_lshr_b32 s1, s4, 16
-; GFX8-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s0
-; GFX8-NEXT:v_mov_b32_e32 v5, s1
-; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v3
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2
 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX9-LABEL: s_copysign_v8bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s8, 0x7fff
+; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:v_mov_b32_e32 v1, s7
-; GFX9-NEXT:s_lshr_b32 s7, s7, 16
-; GFX9-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX9-NEXT:v_mov_b32_e32 v1, s3
-; GF

[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142114

>From da7b0574d489d67f6f05dd396e4a8bdf95941bf8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 11:21:42 +0200
Subject: [PATCH 1/2] AMDGPU: Move bf16 copysign tests to separate file

Make symmetric with other copysign tests
---
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 959 +
 1 file changed, 959 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
new file mode 100644
index 0..4fcce8a6d623f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -0,0 +1,959 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s 
-check-prefixes=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s 
-check-prefixes=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s 
-check-prefixes=GFX10
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck 
%s -check-prefixes=GFX11,GFX11FAKE16
+
+declare bfloat @llvm.copysign.bf16(bfloat, bfloat)
+
+define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
+; GCN-LABEL: v_copysign_bf16_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GCN-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:v_and_b32_e32 v1, 0x8000, v1
+; GFX7-NEXT:v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, v0, v1
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX10-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_bf16_bf16:
+; GFX11:   ; %bb.0:
+; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-NEXT:s_setpc_b64 s[30:31]
+  %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
+  ret bfloat %op
+}
+
+define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
+; GCN-LABEL: v_copysign_bf16_s_bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:s_and_b32 s4, s16, 0x8000
+; GCN-NEXT:s_lshr_b32 s4, s4, 16
+; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT:v_or_b32_e32 v0, s4, v0
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_s_bf16:
+; GFX7:   ; %bb.0:
+; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:s_and_b32 s4, s16, 0x8000
+; GFX7-NEXT:s_lshr_b32 s4, s4, 16
+; GFX7-NEXT:v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT:v_or_b32_e32 v0, s4, v0
+; GFX7-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_s_bf16:
+; GFX8:   ; %bb.0:
+; GFX8-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:v_mov_b32_e32 v1, s16
+; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX8-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_s_bf16:
+; GFX9:   ; %bb.0:
+; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:v_mov_b32_e32 v1, s16
+; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT:s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_s_bf16:
+; GFX10:   ; %bb.0:
+; GFX10-NEXT:s_waitcnt vmcnt(0

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142157

>From ed0712298fd1c3a625ad870d54c5bf3c21052712 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:15:33 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  35 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 174 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 501 ++---
 3 files changed, 129 insertions(+), 581 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bd068362410b..26ea80a802f91 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX7-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
-; GFX7-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:v_mul_f32_e32 v

[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Fixes #141931

---

Patch is 153.02 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142173.diff


6 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1) 
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+10) 
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-6) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+515-610) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+550-649) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c61c52ec5843e..ab3c316f76deb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -756,6 +756,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // allows matching fneg (fabs x) patterns)
 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
 
+// Can do this in one BFI plus a constant materialize.
+setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom);
+
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, 
Legal);
 
@@ -6088,6 +6091,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
 return splitBinaryVectorOp(Op, DAG);
+  case ISD::FCOPYSIGN:
+return lowerFCOPYSIGN(Op, DAG);
   case ISD::MUL:
 return lowerMUL(Op, DAG);
   case ISD::SMULO:
@@ -7115,6 +7120,32 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue 
Op,
   return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
 }
 
+SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Mag = Op.getOperand(0);
+  SDValue Sign = Op.getOperand(1);
+
+  EVT MagVT = Mag.getValueType();
+  EVT SignVT = Sign.getValueType();
+
+  assert(MagVT.isVector());
+
+  if (MagVT == SignVT)
+return Op;
+
+  assert(MagVT.getVectorNumElements() == 2);
+
+  // fcopysign v2f16:mag, v2f32:sign ->
+  //   fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
+
+  SDLoc SL(Op);
+  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
+  SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, 
SignAsInt32);
+
+  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
+
+  return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
+}
+
 // Custom lowering for vector multiplications and s_mul_u64.
 SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h 
b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c42366a1c04c8..283f8136d352a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -149,6 +149,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
+  SDValue lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td 
b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2e2913d88cc54..28557ad516865 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2062,6 +2062,16 @@ def : GCNPat <
 >;
 } // End foreach fp16vt = [f16, bf16]
 
+
+foreach fp16vt = [v2f16, v2bf16] in {
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1)
+>;
+
+}
+
 /** == **/
 /** Immediate Patterns **/
 /** == **/
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 15b049d4d7563..021104114d796 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -36,17 +36,12 @@ define <2 x half> 
@test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:s_mov_b32 s4, 0x7fff7fff
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:v_pack_b32_f16 v1, v1, v2
 ; GFX9-NEXT:v_pk_mul_f16 v1, v3, v

[llvm-branch-commits] [mlir] [MLIR] Add apply_patterns.vector.arm_sve.lower_contraction TD Op (PR #140572)

2025-05-30 Thread Andrzej Warzyński via llvm-branch-commits

https://github.com/banach-space approved this pull request.

LGTM, thanks!

https://github.com/llvm/llvm-project/pull/140572
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142176

None

>From a05dedab56153ae13dfa3ed168e73b42d4188bb0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:48:01 +0200
Subject: [PATCH] AMDGPU: Improve v16f16/v16bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   6 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 565 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 431 ++--
 3 files changed, 126 insertions(+), 876 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecfa6daf7803d..3535eb41682d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
- VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index ab4cff2469467..4bbd170529ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x 
bfloat> inreg %arg_mag, <16
 ;
 ; GFX8-LABEL: s_copysign_v16bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:v_mov_b32_e32 v1, s15
-; GFX8-NEXT:s_lshr_b32 s15, s15, 16
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:v_mov_b32_e32 v2, s15
-; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s6
 ; GFX8-NEXT:v_mov_b32_e32 v2, s14
-; GFX8-NEXT:s_lshr_b32 s7, s14, 16
-; GFX8-NEXT:s_lshr_b32 s6, s6, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:v_mov_b32_e32 v3, s7
-; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:v_mov_b32_e32 v3, s13
-; GFX8-NEXT:s_lshr_b32 s6, s13, 16
-; GFX8-NEXT:s_lshr_b32 s5, s5, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:v_mov_b32_e32 v4, s6
-; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:v_mov_b32_e32 v4, s12
-; GFX8-NEXT:s_lshr_b32 s5, s12, 16
-; GFX8-NEXT:s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:v_mov_b32_e32 v5, s5
-; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:v_mov_b32_e32 v5, s11
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_mov_b32_e32 v5, s3
-; GFX8-NEXT:v_mov_b32_e32 v6, s4
-; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:v_mov_b32_e32 v6, s10
-; GFX8-NEXT:s_lshr_b32 s3, s10, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v

[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142176
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142175

None

>From 196b0107e162236bb902c52ddfba2e732dbc1db2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:46:06 +0200
Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   9 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 229 ++--
 3 files changed, 74 insertions(+), 459 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c30d3f3bd883..ecfa6daf7803d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
-   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
+MVT::v8f16, MVT::v8bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
- VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
- VT == MVT::v32f16);
+ VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
+ VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bc1232ce3ed1..ab4cff2469467 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX8-LABEL: s_copysign_v8bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s7
-; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:s_lshr_b32 s3, s6, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s2
-; GFX8-NEXT:v_mov_b32_e32 v3, s3
-; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:s_lshr_b32 s2, s5, 16
-; GFX8-NEXT:s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s1
-; GFX8-NEXT:v_mov_b32_e32 v4, s2
-; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:s_lshr_b32 s1, s4, 16
-; GFX8-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s0
-; GFX8-NEXT:v_mov_b32_e32 v5, s1
-; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v3
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2
 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX9-LABEL: s_copysign_v8bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s8, 0x7fff
+; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:v_mov_b32_e32 v1, s7
-; GFX9-NEXT:s_lshr_b32 s7, s7, 16
-; GFX9-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX9-NEXT:v_mov_b32_e32 v1, s3

[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142177
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 284.89 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142174.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+13-11) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+938-1162) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1059-1305) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ab3c316f76deb..1c30d3f3bd883 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -757,7 +757,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
 
 // Can do this in one BFI plus a constant materialize.
-setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom);
+setOperationAction(ISD::FCOPYSIGN,
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, 
Legal);
@@ -5936,10 +5938,11 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue 
Op,
   SelectionDAG &DAG) const {
   unsigned Opc = Op.getOpcode();
   EVT VT = Op.getValueType();
-  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
- VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
+ VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
+ VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
@@ -7122,18 +7125,17 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue 
Op,
 
 SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Mag = Op.getOperand(0);
-  SDValue Sign = Op.getOperand(1);
-
   EVT MagVT = Mag.getValueType();
-  EVT SignVT = Sign.getValueType();
 
-  assert(MagVT.isVector());
+  if (MagVT.getVectorNumElements() > 2)
+return splitBinaryVectorOp(Op, DAG);
+
+  SDValue Sign = Op.getOperand(1);
+  EVT SignVT = Sign.getValueType();
 
   if (MagVT == SignVT)
 return Op;
 
-  assert(MagVT.getVectorNumElements() == 2);
-
   // fcopysign v2f16:mag, v2f32:sign ->
   //   fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index a5a36d7122f68..3bc1232ce3ed1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1090,40 +1090,26 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x 
bfloat> inreg %arg_mag, <3 x
 ;
 ; GFX8-LABEL: s_copysign_v3bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s4, 0x7fff
+; GFX8-NEXT:s_mov_b32 s4, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s1
 ; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:s_lshr_b32 s1, s2, 16
-; GFX8-NEXT:s_lshr_b32 s3, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s4, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s1
+; GFX8-NEXT:v_mov_b32_e32 v1, s0
+; GFX8-NEXT:v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s0
-; GFX8-NEXT:v_mov_b32_e32 v3, s2
-; GFX8-NEXT:v_bfi_b32 v2, s4, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v1
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v0
 ; GFX8-NEXT:; return to shader part epilog
 ;
 ; GFX9-LABEL: s_copysign_v3bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s4, 0x7fff
+; GFX9-NEXT:s_mov_b32 s4, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:v_mov_b32_e32 v2, s2
-; GFX9-NEXT:s_lshr_b32 s1, s2, 16
-; GFX9-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT:v_mov_b32_e32 v2, s0
-; GFX9-NEXT:v_mov_b32_e32 v3, s1
-; GFX9-NEXT:v_bfi_b32 v2, s4, v2, v3
-; GFX9-NEXT:v_and_b32_e32 v1, 0x, v1
-; GFX9-NEXT:v_lshl_or_b32 v1, v2, 16, v1
 ; GFX9-NEXT:v_readfirstlane_b32 s0, v1
 ; GFX9-NEXT:v_readfirstlane_b32 s1, v0
 ; GFX

[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142175
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 56.04 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142176.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+75-490) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+48-383) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ecfa6daf7803d..3535eb41682d9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5942,8 +5942,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
- VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index ab4cff2469467..4bbd170529ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1719,87 +1719,31 @@ define amdgpu_ps <8 x i32> @s_copysign_v16bf16(<16 x 
bfloat> inreg %arg_mag, <16
 ;
 ; GFX8-LABEL: s_copysign_v16bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s16, 0x7fff
+; GFX8-NEXT:s_mov_b32 s16, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s7
 ; GFX8-NEXT:v_mov_b32_e32 v1, s15
-; GFX8-NEXT:s_lshr_b32 s15, s15, 16
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s16, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:v_mov_b32_e32 v2, s15
-; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s6
 ; GFX8-NEXT:v_mov_b32_e32 v2, s14
-; GFX8-NEXT:s_lshr_b32 s7, s14, 16
-; GFX8-NEXT:s_lshr_b32 s6, s6, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s16, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:v_mov_b32_e32 v3, s7
-; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s5
 ; GFX8-NEXT:v_mov_b32_e32 v3, s13
-; GFX8-NEXT:s_lshr_b32 s6, s13, 16
-; GFX8-NEXT:s_lshr_b32 s5, s5, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s16, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:v_mov_b32_e32 v4, s6
-; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s4
 ; GFX8-NEXT:v_mov_b32_e32 v4, s12
-; GFX8-NEXT:s_lshr_b32 s5, s12, 16
-; GFX8-NEXT:s_lshr_b32 s4, s4, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s16, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:v_mov_b32_e32 v5, s5
-; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v4, s3
 ; GFX8-NEXT:v_mov_b32_e32 v5, s11
-; GFX8-NEXT:s_lshr_b32 s4, s11, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v4, s16, v4, v5
-; GFX8-NEXT:v_mov_b32_e32 v5, s3
-; GFX8-NEXT:v_mov_b32_e32 v6, s4
-; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NEXT:v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v5, s2
 ; GFX8-NEXT:v_mov_b32_e32 v6, s10
-; GFX8-NEXT:s_lshr_b32 s3, s10, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v5, s16, v5, v6
-; GFX8-NEXT:v_mov_b32_e32 v6, s2
-; GFX8-NEXT:v_

[llvm-branch-commits] [llvm] AMDGPU: Improve v16f16/v16bf16 copysign handling (PR #142176)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142176
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142177
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142173
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142177** https://app.graphite.dev/github/pr/llvm/llvm-project/142177?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142176** https://app.graphite.dev/github/pr/llvm/llvm-project/142176?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142175** https://app.graphite.dev/github/pr/llvm/llvm-project/142175?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142174** https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142174?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142173** https://app.graphite.dev/github/pr/llvm/llvm-project/142173?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142174
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142174
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142175
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 32.54 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142175.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+5-4) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+41-254) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+28-201) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c30d3f3bd883..ecfa6daf7803d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
-   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
+MVT::v8f16, MVT::v8bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
- VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
- VT == MVT::v32f16);
+ VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
+ VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bc1232ce3ed1..ab4cff2469467 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX8-LABEL: s_copysign_v8bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s7
-; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:s_lshr_b32 s3, s6, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s2
-; GFX8-NEXT:v_mov_b32_e32 v3, s3
-; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:s_lshr_b32 s2, s5, 16
-; GFX8-NEXT:s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s1
-; GFX8-NEXT:v_mov_b32_e32 v4, s2
-; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:s_lshr_b32 s1, s4, 16
-; GFX8-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s0
-; GFX8-NEXT:v_mov_b32_e32 v5, s1
-; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v3
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2
 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX9-LABEL: s_copysign_v8bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s8, 0x7fff
+; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:v_mov_b32_e32 v1, s7
-; GFX9-NEXT:s_lshr_b32 s7, s7, 16
-; GFX9-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX9-NEXT:v_mov_b32_e32 v1, s3
-; GFX9-NEXT:v_mov_b32_e32 v2, s7
-; GFX9-NEXT:v_bfi_b32 v1, s8, v1, v

[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---

Patch is 46.39 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/142177.diff


3 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+688) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+307) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
-MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+MVT::v32f16, MVT::v32bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
  VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
  VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> 
%mag, <16 x bfloat> %sign
   ret <16 x bfloat> %result
 }
 
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> 
%sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN:   ; %bb.0:
+; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT:buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT:s_waitcnt vmcnt(1)
+; GCN-NEXT:v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT:v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT:v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT:v_or_b32_e32 v31, v32, v31
+; GCN-NEXT:v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT:v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT:v_or_b32_e32 v30, v30, v32
+; GCN-NEXT:v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT:v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT:v_or_b32_e32 v29, v29, v32
+; GCN-NEXT:v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT:v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT:v_or_b32_e32 v28, v28, v32
+; GCN-NEXT:v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT:v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT:v_or_b32_e32 v27, v27, v32
+; GCN-NEXT:v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT:v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT:v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT:buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT:v_or_b32_e32 v26, v26, v32
+; GCN-NEXT:v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT:v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT:s_waitcnt vmcnt(0)
+; GCN-NEXT:v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: 

[llvm-branch-commits] [llvm] AMDGPU: Make v2f16/v2bf16 copysign legal (PR #142173)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

Note GloblISel currently just expands G_COPYSIGN, and there is TODO to check 
the expansion does the right thing to form a BFI, but it does not. We should 
probably match the custom lowering / directly legal there 

https://github.com/llvm/llvm-project/pull/142173
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v8f16/v8bf16 copysign handling (PR #142175)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm updated 
https://github.com/llvm/llvm-project/pull/142175

>From 883a508fa80728ad2a916d4a5963b23cf585aaa2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 17:46:06 +0200
Subject: [PATCH] AMDGPU: Improve v8f16/v8bf16 copysign handling

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |   9 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 295 +++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 229 ++--
 3 files changed, 74 insertions(+), 459 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c30d3f3bd883..ecfa6daf7803d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -758,7 +758,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
 // Can do this in one BFI plus a constant materialize.
 setOperationAction(ISD::FCOPYSIGN,
-   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+   {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
+MVT::v8f16, MVT::v8bf16},
Custom);
 
 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5940,9 +5941,9 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
   EVT VT = Op.getValueType();
   assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
  VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
- VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
- VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
- VT == MVT::v32f16);
+ VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
+ VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32 ||
+ VT == MVT::v32i16 || VT == MVT::v32f16);
 
   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 3bc1232ce3ed1..ab4cff2469467 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1390,47 +1390,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX8-LABEL: s_copysign_v8bf16:
 ; GFX8:   ; %bb.0:
-; GFX8-NEXT:s_movk_i32 s8, 0x7fff
+; GFX8-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX8-NEXT:v_mov_b32_e32 v0, s3
 ; GFX8-NEXT:v_mov_b32_e32 v1, s7
-; GFX8-NEXT:s_lshr_b32 s7, s7, 16
-; GFX8-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX8-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX8-NEXT:v_mov_b32_e32 v1, s3
-; GFX8-NEXT:v_mov_b32_e32 v2, s7
-; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:v_mov_b32_e32 v2, s6
-; GFX8-NEXT:s_lshr_b32 s3, s6, 16
-; GFX8-NEXT:s_lshr_b32 s2, s2, 16
 ; GFX8-NEXT:v_bfi_b32 v1, s8, v1, v2
-; GFX8-NEXT:v_mov_b32_e32 v2, s2
-; GFX8-NEXT:v_mov_b32_e32 v3, s3
-; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:v_mov_b32_e32 v3, s5
-; GFX8-NEXT:s_lshr_b32 s2, s5, 16
-; GFX8-NEXT:s_lshr_b32 s1, s1, 16
 ; GFX8-NEXT:v_bfi_b32 v2, s8, v2, v3
-; GFX8-NEXT:v_mov_b32_e32 v3, s1
-; GFX8-NEXT:v_mov_b32_e32 v4, s2
-; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_lshlrev_b32_e32 v3, 16, v3
-; GFX8-NEXT:v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:v_mov_b32_e32 v4, s4
-; GFX8-NEXT:s_lshr_b32 s1, s4, 16
-; GFX8-NEXT:s_lshr_b32 s0, s0, 16
 ; GFX8-NEXT:v_bfi_b32 v3, s8, v3, v4
-; GFX8-NEXT:v_mov_b32_e32 v4, s0
-; GFX8-NEXT:v_mov_b32_e32 v5, s1
-; GFX8-NEXT:v_bfi_b32 v4, s8, v4, v5
-; GFX8-NEXT:v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NEXT:v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:v_readfirstlane_b32 s0, v3
 ; GFX8-NEXT:v_readfirstlane_b32 s1, v2
 ; GFX8-NEXT:v_readfirstlane_b32 s2, v1
@@ -1439,47 +1411,19 @@ define amdgpu_ps <4 x i32> @s_copysign_v8bf16(<8 x 
bfloat> inreg %arg_mag, <8 x
 ;
 ; GFX9-LABEL: s_copysign_v8bf16:
 ; GFX9:   ; %bb.0:
-; GFX9-NEXT:s_movk_i32 s8, 0x7fff
+; GFX9-NEXT:s_mov_b32 s8, 0x7fff7fff
 ; GFX9-NEXT:v_mov_b32_e32 v0, s3
 ; GFX9-NEXT:v_mov_b32_e32 v1, s7
-; GFX9-NEXT:s_lshr_b32 s7, s7, 16
-; GFX9-NEXT:s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:v_bfi_b32 v0, s8, v0, v1
-; GFX9-NEXT:v_mov_b32_e32 v1, s3
-; GF

[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)

2025-05-30 Thread Stanislav Mekhanoshin via llvm-branch-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/142115
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [OpenMP] Add directive spellings introduced in spec v6.0 (PR #141772)

2025-05-30 Thread Michael Klemm via llvm-branch-commits

https://github.com/mjklemm approved this pull request.

LGTM

https://github.com/llvm/llvm-project/pull/141772
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] [KeyInstr][Clang] Coerced store atoms (PR #134653)

2025-05-30 Thread Stephen Tozer via llvm-branch-commits

https://github.com/SLTozer approved this pull request.


https://github.com/llvm/llvm-project/pull/134653
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [UBSan][Ignorelist] Expanding =sanitize to global. (PR #142077)

2025-05-30 Thread Qinkun Bao via llvm-branch-commits

https://github.com/qinkunbao closed 
https://github.com/llvm/llvm-project/pull/142077
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [clang] b9503fe - Revert "Add IR Profile-Guided Optimization (IR PGO) support to the Flang comp…"

2025-05-30 Thread via llvm-branch-commits

Author: Tarun Prabhu
Date: 2025-05-30T08:22:15-06:00
New Revision: b9503fe262c416111ee77be30767a791cf750fb8

URL: 
https://github.com/llvm/llvm-project/commit/b9503fe262c416111ee77be30767a791cf750fb8
DIFF: 
https://github.com/llvm/llvm-project/commit/b9503fe262c416111ee77be30767a791cf750fb8.diff

LOG: Revert "Add IR Profile-Guided Optimization (IR PGO) support to the Flang 
comp…"

This reverts commit d27a210a77af63568db9f829702b4b2c98473a46.

Added: 


Modified: 
clang/include/clang/Basic/CodeGenOptions.def
clang/include/clang/Basic/CodeGenOptions.h
clang/include/clang/Basic/ProfileList.h
clang/include/clang/Driver/Options.td
clang/lib/Basic/ProfileList.cpp
clang/lib/CodeGen/BackendUtil.cpp
clang/lib/CodeGen/CodeGenAction.cpp
clang/lib/CodeGen/CodeGenFunction.cpp
clang/lib/CodeGen/CodeGenModule.cpp
clang/lib/Driver/ToolChains/Flang.cpp
clang/lib/Frontend/CompilerInvocation.cpp
flang/include/flang/Frontend/CodeGenOptions.def
flang/include/flang/Frontend/CodeGenOptions.h
flang/lib/Frontend/CompilerInvocation.cpp
flang/lib/Frontend/FrontendActions.cpp
flang/test/Driver/flang-f-opts.f90
llvm/include/llvm/Frontend/Driver/CodeGenOptions.h
llvm/lib/Frontend/Driver/CodeGenOptions.cpp

Removed: 
flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext
flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext
flang/test/Profile/gcc-flag-compatibility.f90



diff  --git a/clang/include/clang/Basic/CodeGenOptions.def 
b/clang/include/clang/Basic/CodeGenOptions.def
index 11dad53a52efe..aad4e107cbeb3 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -223,11 +223,9 @@ AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os 
(==1) or -Oz (==2) is
 CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic
 CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation 
profiling
 /// Choose profile instrumenation kind or no instrumentation.
-
-ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, 
llvm::driver::ProfileInstrKind::ProfileNone)
-
+ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileNone)
 /// Choose profile kind for PGO use compilation.
-ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, 
llvm::driver::ProfileInstrKind::ProfileNone)
+ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone)
 /// Partition functions into N groups and select only functions in group i to 
be
 /// instrumented. Selected group numbers can be 0 to N-1 inclusive.
 VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1)

diff  --git a/clang/include/clang/Basic/CodeGenOptions.h 
b/clang/include/clang/Basic/CodeGenOptions.h
index bffbd00b1bd72..278803f7bb960 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -518,41 +518,35 @@ class CodeGenOptions : public CodeGenOptionsBase {
 
   /// Check if Clang profile instrumenation is on.
   bool hasProfileClangInstr() const {
-return getProfileInstr() ==
-   llvm::driver::ProfileInstrKind::ProfileClangInstr;
+return getProfileInstr() == ProfileClangInstr;
   }
 
   /// Check if IR level profile instrumentation is on.
   bool hasProfileIRInstr() const {
-return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr;
+return getProfileInstr() == ProfileIRInstr;
   }
 
   /// Check if CS IR level profile instrumentation is on.
   bool hasProfileCSIRInstr() const {
-return getProfileInstr() ==
-   llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
+return getProfileInstr() == ProfileCSIRInstr;
   }
 
   /// Check if any form of instrumentation is on.
-  bool hasProfileInstr() const {
-return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone;
-  }
+  bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; }
 
   /// Check if Clang profile use is on.
   bool hasProfileClangUse() const {
-return getProfileUse() == 
llvm::driver::ProfileInstrKind::ProfileClangInstr;
+return getProfileUse() == ProfileClangInstr;
   }
 
   /// Check if IR level profile use is on.
   bool hasProfileIRUse() const {
-return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr ||
-   getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
+return getProfileUse() == ProfileIRInstr ||
+   getProfileUse() == ProfileCSIRInstr;
   }
 
   /// Check if CSIR profile use is on.
-  bool hasProfileCSIRUse() const {
-return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
-  }
+  bool hasProfileCSIRUse() const { return getProfileUse() == ProfileCSIRInstr; 
}
 
   /// Check if type and variable info should be emitted.
   bool hasReducedDebugInfo() const {

diff  --git a/clang/include/clang/Basic/ProfileList.h 
b/c

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142157

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.

>From ad2fdd8df6f80fb7c3792b33012b0ecba28d656b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:15:33 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign sign type combine

This avoids some ugly codegen on pre-16-bit instruction targets now
from annoying f16 legalization effects. This also avoids regressions
on newer targets in a future patch.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp  |  35 +++-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll |  64 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 186 ++---
 3 files changed, 65 insertions(+), 220 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index af85c6bef273d..c61c52ec5843e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11737,9 +11737,10 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
   EVT MagVT = MagnitudeOp.getValueType();
-  if (MagVT.getScalarType() == MVT::f64) {
-unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
 
+  unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+  if (MagVT.getScalarType() == MVT::f64) {
 EVT F32VT = MagVT.isVector()
 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
 : MVT::v2f32;
@@ -11777,7 +11778,7 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignVT != MVT::f64)
+  if (SignVT.getScalarType() != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
@@ -11785,13 +11786,31 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   // fcopysign f64:x, f64:y ->
   //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
   // TODO: In some cases it might make sense to go all the way to f16.
-  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
-  SDValue SignAsF32 =
-  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
-  DAG.getConstant(1, DL, MVT::i32));
+
+  EVT F32VT = MagVT.isVector()
+  ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+  : MVT::v2f32;
+
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
+
+  SmallVector F32Signs;
+  for (unsigned I = 0; I != NumElts; ++I) {
+// Take sign from odd elements of cast vector
+SDValue SignAsF32 =
+DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+DAG.getConstant(2 * I + 1, DL, MVT::i32));
+F32Signs.push_back(SignAsF32);
+  }
+
+  SDValue NewSign =
+  NumElts == 1
+  ? F32Signs.back()
+  : DAG.getNode(ISD::BUILD_VECTOR, DL,
+EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
+F32Signs);
 
   return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
- SignAsF32);
+ NewSign);
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 90a368885bfdc..45bf0770ad924 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4677,37 +4677,33 @@ define <2 x bfloat> 
@v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
 ; GCN-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:v_cvt_f32_f64_e32 v2, v[2:3]
-; GCN-NEXT:v_cvt_f32_f64_e32 v3, v[4:5]
+; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v5
+; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
 ; GCN-NEXT:v_lshrrev_b32_e32 v2, 16, v2
-; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT:v_lshrrev_b32_e32 v3, 16, v3
 ; GCN-NEXT:v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT:v_and_b32_e32 v3, 0x8000, v3
-; GCN-NEXT:v_and_b32_e32 v2, 0x8000, v2
-; GCN-NEXT:v_or_b32_e32 v1, v1, v3
-; GCN-NEXT:v_or_b32_e32 v0, v0, v2
-; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT:v_or_b32_e32 v1, v1, v2
+; GCN-NEXT:v_or_b32_e32 v0, v0, v3
 ; GCN-NEXT:v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT:v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT:s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
 ; GFX7:   ; %bb.0:
 ; GFX7-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgk

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/142156

None

>From 41692703e0fea3a91ffcb910eb56b5921f2b9ed1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case

---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 ---
 .../AMDGPU/copysign-simplify-demanded-bits.ll |  2 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll| 99 ---
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 65 ++--
 4 files changed, 117 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index e99a6bf273e3b..90a368885bfdc 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/Cod

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142156
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is 
> open. Once all requirements are satisfied, merge this PR as a stack  href="https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-downstack-mergeability-warning";
>  >on Graphite.
> https://graphite.dev/docs/merge-pull-requests";>Learn more

* **#142157** https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈 https://app.graphite.dev/github/pr/llvm/llvm-project/142157?utm_source=stack-comment-view-in-graphite";
 target="_blank">(View in Graphite)
* **#142156** https://app.graphite.dev/github/pr/llvm/llvm-project/142156?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142115** https://app.graphite.dev/github/pr/llvm/llvm-project/142115?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142114** https://app.graphite.dev/github/pr/llvm/llvm-project/142114?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#142113** https://app.graphite.dev/github/pr/llvm/llvm-project/142113?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`




This stack of pull requests is managed by https://graphite.dev?utm-source=stack-comment";>Graphite. Learn 
more about https://stacking.dev/?utm_source=stack-comment";>stacking.


https://github.com/llvm/llvm-project/pull/142157
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142156
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)

2025-05-30 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes



---
Full diff: https://github.com/llvm/llvm-project/pull/142156.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-12) 
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+40-59) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+30-35) 


``diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp 
b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue 
SITargetLowering::performFCopySignCombine(SDNode *N,
   DAGCombinerInfo &DCI) const {
   SDValue MagnitudeOp = N->getOperand(0);
   SDValue SignOp = N->getOperand(1);
+
+  // The generic combine for fcopysign + fp cast is too conservative with
+  // vectors, and also gets confused by the splitting we will perform here, so
+  // peek through FP casts.
+  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+  SignOp.getOpcode() == ISD::FP_ROUND)
+SignOp = SignOp.getOperand(0);
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+  EVT SignVT = SignOp.getValueType();
 
   // f64 fcopysign is really an f32 copysign on the high bits, so replace the
   // lower half with a copy.
   // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
-  if (MagnitudeOp.getValueType() == MVT::f64) {
-SDValue MagAsVector =
-DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
-SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(0, DL, MVT::i32));
-SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
-MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+  EVT MagVT = MagnitudeOp.getValueType();
+  if (MagVT.getScalarType() == MVT::f64) {
+unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+EVT F32VT = MagVT.isVector()
+? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * 
NumElts)
+: MVT::v2f32;
+
+SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+SmallVector NewElts;
+for (unsigned I = 0; I != NumElts; ++I) {
+  SDValue MagLo =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I, DL, MVT::i32));
+  SDValue MagHi =
+  DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+  DAG.getConstant(2 * I + 1, DL, MVT::i32));
 
-SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+  SDValue SignOpElt =
+  MagVT.isVector()
+  ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, 
SignVT.getScalarType(),
+SignOp, DAG.getConstant(I, DL, MVT::i32))
+  : SignOp;
+
+  SDValue HiOp =
+  DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+  SDValue Vector =
+  DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+  SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+  NewElts.push_back(NewElt);
+}
 
-SDValue Vector =
-DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+if (NewElts.size() == 1)
+  return NewElts[0];
 
-return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
   }
 
-  if (SignOp.getValueType() != MVT::f64)
+  if (SignVT != MVT::f64)
 return SDValue();
 
   // Reduce width of sign operand, we only need the highest bit.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
 ; GFX9:   ; %bb.0:
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT:v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[4:5], v4
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[6:7], v6
 ; GFX9-NEXT:s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index e99a6bf273e3b..90a368885bfdc 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4055,50 +4055,38 @@ define <2 x double> 
@v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x do

[llvm-branch-commits] [llvm] AMDGPU: Handle vectors in copysign sign type combine (PR #142157)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/142157
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)

2025-05-30 Thread via llvm-branch-commits


@@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, 
mcdxbc::RootSignatureDesc &RSD,
   return false;
 }
 
+static bool parseRootDescriptors(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode) {
+
+  if (RootDescriptorNode->getNumOperands() != 5)
+return reportError(Ctx, "Invalid format for RootConstants Element");
+
+  std::optional ElementText =
+  extractMdStringValue(RootDescriptorNode, 0);
+  assert(!ElementText->empty());
+
+  dxbc::RootParameterHeader Header;
+  Header.ParameterType =
+  StringSwitch(*ElementText)
+  .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV))
+  .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV))
+  .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV));

joaosaffran wrote:

I did some research, it seems that it would be undefined behavior, @bogner 
correct me if I am wrong please. 

Will update to handle it better

https://github.com/llvm/llvm-project/pull/139781
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Move bf16 copysign tests to separate file (PR #142114)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142114).


https://github.com/llvm/llvm-project/pull/142114
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Add more f16 copysign tests (PR #142115)

2025-05-30 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

### Merge activity

* **May 30, 5:45 PM UTC**: A user started a stack merge that includes this pull 
request via 
[Graphite](https://app.graphite.dev/github/pr/llvm/llvm-project/142115).


https://github.com/llvm/llvm-project/pull/142115
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)

2025-05-30 Thread Changpeng Fang via llvm-branch-commits

https://github.com/changpeng approved this pull request.


https://github.com/llvm/llvm-project/pull/142174
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)

2025-05-30 Thread via llvm-branch-commits


@@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, 
mcdxbc::RootSignatureDesc &RSD,
   return false;
 }
 
+static bool parseRootDescriptors(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode) {
+
+  if (RootDescriptorNode->getNumOperands() != 5)
+return reportError(Ctx, "Invalid format for RootConstants Element");
+
+  std::optional ElementText =
+  extractMdStringValue(RootDescriptorNode, 0);
+  assert(!ElementText->empty());
+
+  dxbc::RootParameterHeader Header;
+  Header.ParameterType =
+  StringSwitch(*ElementText)
+  .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV))
+  .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV))
+  .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV));

joaosaffran wrote:

Took a look into this, I don't think a default scenario is needed here, this is 
previously checked and error handled when this method is called.

https://github.com/llvm/llvm-project/pull/139781
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] [HLSL] Adding support for root descriptors in root signature metadata representation (PR #139781)

2025-05-30 Thread via llvm-branch-commits

https://github.com/joaosaffran updated 
https://github.com/llvm/llvm-project/pull/139781

>From f871e2049418d6f09bedaa685e72d3a76f15 Mon Sep 17 00:00:00 2001
From: joaosaffran 
Date: Tue, 13 May 2025 02:07:31 +
Subject: [PATCH 1/5] adding support for root descriptors

---
 llvm/lib/Target/DirectX/DXILRootSignature.cpp | 64 ++-
 llvm/lib/Target/DirectX/DXILRootSignature.h   |  3 +-
 .../RootSignature-RootDescriptor.ll   | 34 ++
 3 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 
llvm/test/CodeGen/DirectX/ContainerData/RootSignature-RootDescriptor.ll

diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp 
b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
index 1bd816b026fec..77bdb2c2f588f 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp
@@ -55,6 +55,14 @@ static std::optional extractMdIntValue(MDNode 
*Node,
   return std::nullopt;
 }
 
+static std::optional extractMdStringValue(MDNode *Node,
+ unsigned int OpId) {
+  MDString *NodeText = cast(Node->getOperand(OpId));
+  if (NodeText == nullptr)
+return std::nullopt;
+  return NodeText->getString();
+}
+
 static bool parseRootFlags(LLVMContext *Ctx, mcdxbc::RootSignatureDesc &RSD,
MDNode *RootFlagNode) {
 
@@ -105,6 +113,56 @@ static bool parseRootConstants(LLVMContext *Ctx, 
mcdxbc::RootSignatureDesc &RSD,
   return false;
 }
 
+static bool parseRootDescriptors(LLVMContext *Ctx,
+ mcdxbc::RootSignatureDesc &RSD,
+ MDNode *RootDescriptorNode) {
+
+  if (RootDescriptorNode->getNumOperands() != 5)
+return reportError(Ctx, "Invalid format for RootConstants Element");
+
+  std::optional ElementText =
+  extractMdStringValue(RootDescriptorNode, 0);
+  assert(!ElementText->empty());
+
+  dxbc::RootParameterHeader Header;
+  Header.ParameterType =
+  StringSwitch(*ElementText)
+  .Case("RootCBV", llvm::to_underlying(dxbc::RootParameterType::CBV))
+  .Case("RootSRV", llvm::to_underlying(dxbc::RootParameterType::SRV))
+  .Case("RootUAV", llvm::to_underlying(dxbc::RootParameterType::UAV));
+
+  if (std::optional Val = extractMdIntValue(RootDescriptorNode, 1))
+Header.ShaderVisibility = *Val;
+  else
+return reportError(Ctx, "Invalid value for ShaderVisibility");
+
+  dxbc::RTS0::v1::RootDescriptor Descriptor;
+  if (std::optional Val = extractMdIntValue(RootDescriptorNode, 2))
+Descriptor.ShaderRegister = *Val;
+  else
+return reportError(Ctx, "Invalid value for ShaderRegister");
+
+  if (std::optional Val = extractMdIntValue(RootDescriptorNode, 3))
+Descriptor.RegisterSpace = *Val;
+  else
+return reportError(Ctx, "Invalid value for RegisterSpace");
+
+  if (RSD.Version == 1) {
+RSD.ParametersContainer.addParameter(Header, Descriptor);
+return false;
+  }
+  assert(RSD.Version > 1);
+  dxbc::RTS0::v2::RootDescriptor DescriptorV2(Descriptor);
+
+  if (std::optional Val = extractMdIntValue(RootDescriptorNode, 4))
+DescriptorV2.Flags = *Val;
+  else
+return reportError(Ctx, "Invalid value for Root Descriptor Flags");
+
+  RSD.ParametersContainer.addParameter(Header, DescriptorV2);
+  return false;
+}
+
 static bool parseRootSignatureElement(LLVMContext *Ctx,
   mcdxbc::RootSignatureDesc &RSD,
   MDNode *Element) {
@@ -116,6 +174,9 @@ static bool parseRootSignatureElement(LLVMContext *Ctx,
   StringSwitch(ElementText->getString())
   .Case("RootFlags", RootSignatureElementKind::RootFlags)
   .Case("RootConstants", RootSignatureElementKind::RootConstants)
+  .Case("RootCBV", RootSignatureElementKind::RootDescriptors)
+  .Case("RootSRV", RootSignatureElementKind::RootDescriptors)
+  .Case("RootUAV", RootSignatureElementKind::RootDescriptors)
   .Default(RootSignatureElementKind::Error);
 
   switch (ElementKind) {
@@ -124,7 +185,8 @@ static bool parseRootSignatureElement(LLVMContext *Ctx,
 return parseRootFlags(Ctx, RSD, Element);
   case RootSignatureElementKind::RootConstants:
 return parseRootConstants(Ctx, RSD, Element);
-break;
+  case RootSignatureElementKind::RootDescriptors:
+return parseRootDescriptors(Ctx, RSD, Element);
   case RootSignatureElementKind::Error:
 return reportError(Ctx, "Invalid Root Signature Element: " +
 ElementText->getString());
diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.h 
b/llvm/lib/Target/DirectX/DXILRootSignature.h
index 93ec614f1ab85..b8742d1b1fdfd 100644
--- a/llvm/lib/Target/DirectX/DXILRootSignature.h
+++ b/llvm/lib/Target/DirectX/DXILRootSignature.h
@@ -27,7 +27,8 @@ namespace dxil {
 enum class RootSignatureElementKind {
   Error = 0,
   RootFlags = 1,
-  RootConstants = 2
+  RootConstants = 2,
+  RootDes