[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/97151

Math library code has quite a few places with complex bit
logic that are ultimately fed into a copysign. This helps
avoid some regressions in a future patch.

This assumes the position in the float type, which should
at least be valid for IEEE types. Not sure if we need to guard
against ppc_fp128 or anything else weird.

There appears to be some value in simplifying the value operand
as well, but I'll address that separately.

>From 1cc5dcac0c7c83adaef02ad2d092764de1922e4b Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sat, 29 Jun 2024 07:42:27 +0200
Subject: [PATCH] DAG: Call SimplifyDemandedBits on fcopysign sign value

Math library code has quite a few places with complex bit
logic that are ultimately fed into a copysign. This helps
avoid some regressions in a future patch.

This assumes the position in the float type, which should
at least be valid for IEEE types. Not sure if we need to guard
against ppc_fp128 or anything else weird.

There appears to be some value in simplifying the value operand
as well, but I'll address that separately.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 ++
 .../AMDGPU/copysign-simplify-demanded-bits.ll | 22 +++-
 .../PowerPC/fp128-bitcast-after-operation.ll  | 55 +--
 llvm/test/CodeGen/RISCV/double-arith.ll   |  4 +-
 .../RISCV/double-bitmanip-dagcombines.ll  | 12 ++--
 .../RISCV/float-bitmanip-dagcombines.ll   | 10 ++--
 6 files changed, 47 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 254d63abdf805..eaab2b3421190 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17565,6 +17565,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
 
+  // We only take the sign bit from the sign operand.
+  EVT SignVT = N1.getValueType();
+  if (SimplifyDemandedBits(N1,
+   APInt::getSignMask(SignVT.getScalarSizeInBits(
+return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 1eccb55202f02..af4f236c783c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -11,11 +11,10 @@ define half @test_pown_reduced_fast_f16_known_odd(half %x, 
i32 %y.arg) #0 {
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0
 ; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:v_mul_f16_e64 v0, |v0|, v1
-; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT:v_mul_f16_e64 v1, |v0|, v1
+; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -37,10 +36,9 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 
x half> %x, <2 x i3
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT:v_and_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_pack_b32_f16 v1, v1, v2
 ; GFX9-NEXT:v_pk_mul_f16 v1, v3, v1
 ; GFX9-NEXT:v_bfi_b32 v2, s4, v1, v0
@@ -67,10 +65,9 @@ define float @test_pown_reduced_fast_f32_known_odd(float %x, 
i32 %y.arg) #0 {
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0
 ; GFX9-NEXT:s_brev_b32 s4, -2
-; GFX9-NEXT:v_mul_f32_e64 v0, |v0|, v1
-; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT:v_mul_f32_e64 v1, |v0|, v1
+; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -94,8 +91,6 @@ define <2 x float> @test_pown_reduced_fast_v2f32_known_odd(<2 
x float> %x, <2 x
 ; GFX9-NEXT:s_brev_b32 s4, -2
 ; GFX9-NEXT:v_mul_f32_e64 v3, |v1|, v3
 ; GFX9-NEXT:v_mul_f32_e64 v2, |v0|, v2
-; GFX9-NEXT:v_and_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:v_and_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:v_bfi_b32 v0, s4, v2, v0
 ; GFX9-NEXT:v_bfi_b32 v1, s4, v3, v1
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
@@ -118,8 +113,7 @@ define double @test_pown_reduced_fast_f64_known_odd(double 
%x, i32 %y.arg) #0 {
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[2:3], v2
 ;

[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/97151
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#97151** https://app.graphite.dev/github/pr/llvm/llvm-project/97151?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈
* **#97150** https://app.graphite.dev/github/pr/llvm/llvm-project/97150?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`

This stack of pull requests is managed by Graphite. https://stacking.dev/?utm_source=stack-comment";>Learn more about 
stacking.


 Join @arsenm and the rest of your teammates on https://graphite.dev?utm-source=stack-comment";>https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="11px" height="11px"/> Graphite
  

https://github.com/llvm/llvm-project/pull/97151
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/97151
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)

2024-06-29 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)


Changes

Math library code has quite a few places with complex bit
logic that are ultimately fed into a copysign. This helps
avoid some regressions in a future patch.

This assumes the position in the float type, which should
at least be valid for IEEE types. Not sure if we need to guard
against ppc_fp128 or anything else weird.

There appears to be some value in simplifying the value operand
as well, but I'll address that separately.

---
Full diff: https://github.com/llvm/llvm-project/pull/97151.diff


6 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
(+7-15) 
- (modified) llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll 
(+25-30) 
- (modified) llvm/test/CodeGen/RISCV/double-arith.ll (+1-3) 
- (modified) llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll (+4-8) 
- (modified) llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll (+4-6) 


```diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 254d63abdf805..eaab2b3421190 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17565,6 +17565,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
 
+  // We only take the sign bit from the sign operand.
+  EVT SignVT = N1.getValueType();
+  if (SimplifyDemandedBits(N1,
+   APInt::getSignMask(SignVT.getScalarSizeInBits(
+return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll 
b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 1eccb55202f02..af4f236c783c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -11,11 +11,10 @@ define half @test_pown_reduced_fast_f16_known_odd(half %x, 
i32 %y.arg) #0 {
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0
 ; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:v_mul_f16_e64 v0, |v0|, v1
-; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT:v_mul_f16_e64 v1, |v0|, v1
+; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %fabs = call half @llvm.fabs.f16(half %x)
@@ -37,10 +36,9 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 
x half> %x, <2 x i3
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
 ; GFX9-NEXT:v_and_b32_e32 v3, 0x7fff7fff, v0
-; GFX9-NEXT:v_and_b32_e32 v0, 0x80008000, v0
+; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:s_movk_i32 s4, 0x7fff
 ; GFX9-NEXT:v_pack_b32_f16 v1, v1, v2
 ; GFX9-NEXT:v_pk_mul_f16 v1, v3, v1
 ; GFX9-NEXT:v_bfi_b32 v2, s4, v1, v0
@@ -67,10 +65,9 @@ define float @test_pown_reduced_fast_f32_known_odd(float %x, 
i32 %y.arg) #0 {
 ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0
 ; GFX9-NEXT:s_brev_b32 s4, -2
-; GFX9-NEXT:v_mul_f32_e64 v0, |v0|, v1
-; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2
+; GFX9-NEXT:v_mul_f32_e64 v1, |v0|, v1
+; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
   %fabs = call float @llvm.fabs.f32(float %x)
@@ -94,8 +91,6 @@ define <2 x float> @test_pown_reduced_fast_v2f32_known_odd(<2 
x float> %x, <2 x
 ; GFX9-NEXT:s_brev_b32 s4, -2
 ; GFX9-NEXT:v_mul_f32_e64 v3, |v1|, v3
 ; GFX9-NEXT:v_mul_f32_e64 v2, |v0|, v2
-; GFX9-NEXT:v_and_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT:v_and_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:v_bfi_b32 v0, s4, v2, v0
 ; GFX9-NEXT:v_bfi_b32 v1, s4, v3, v1
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
@@ -118,8 +113,7 @@ define double @test_pown_reduced_fast_f64_known_odd(double 
%x, i32 %y.arg) #0 {
 ; GFX9-NEXT:v_cvt_f64_i32_e32 v[2:3], v2
 ; GFX9-NEXT:s_brev_b32 s4, -2
 ; GFX9-NEXT:v_mul_f64 v[2:3], |v[0:1]|, v[2:3]
-; GFX9-NEXT:v_and_b32_e32 v0, 0x8000, v1
-; GFX9-NEXT:v_bfi_b32 v1, s4, v3, v0
+; GFX9-NEXT:v_bfi_b32 v1, s4, v3, v1
 ; GFX9-NEXT:v_mov_b32_e32 v0, v2
 ; GFX9-NEXT:s_setpc_b64 s[30:31]
   %y = or i32 %y.arg, 1
@@ -144,10 +138,8 @@ define <2 x double> 
@test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
 ; GFX9-NEXT:s_brev_b32 s4, -2
 ; GFX9-NEXT:v_mul_f6

[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/97152

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.

>From 339a76086df9e50218561f568d70683f14ef1631 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sat, 26 Aug 2023 13:19:06 -0400
Subject: [PATCH] AMDGPU: Use real copysign in fast pow

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.
---
 llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp |  7 ++-
 .../AMDGPU/amdgpu-simplify-libcall-pow.ll | 52 
 .../AMDGPU/amdgpu-simplify-libcall-pown.ll| 59 ---
 llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 30 +-
 4 files changed, 68 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 456f3cb332cf8..27fa67ce5b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, 
IRBuilder<> &B,
   if (needcopysign) {
 Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
 Type *nTy = FPOp->getType()->getWithNewType(nTyS);
-unsigned size = nTy->getScalarSizeInBits();
 Value *opr_n = FPOp->getOperand(1);
 if (opr_n->getType()->getScalarType()->isIntegerTy())
   opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
 else
   opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
 
+unsigned size = nTy->getScalarSizeInBits();
 Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
 sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
-nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
-nval = B.CreateBitCast(nval, opr0->getType());
+
+nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+nullptr, "__pow_sign");
   }
 
   LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b4b0f881f3be..ab52c8ff8d399 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
 ; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT:ret <2 x float> poison
+; CHECK-NEXT:[[__EXP2:%.*]] = call nnan ninf afn <2 x float> 
@llvm.exp2.v2f32(<2 x float> poison)
+; CHECK-NEXT:ret <2 x float> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 
x float> poison)
   ret <2 x float> %pow
@@ -2215,10 +2216,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:ret float [[TMP5]]
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float 
@llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2303,10 +2303,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:ret float [[TMP5]]
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float 
@llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2352,10 +2351,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and 

[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/97152
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#97152** https://app.graphite.dev/github/pr/llvm/llvm-project/97152?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈
* **#97151** https://app.graphite.dev/github/pr/llvm/llvm-project/97151?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* **#97150** https://app.graphite.dev/github/pr/llvm/llvm-project/97150?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`

This stack of pull requests is managed by Graphite. https://stacking.dev/?utm_source=stack-comment";>Learn more about 
stacking.


 Join @arsenm and the rest of your teammates on https://graphite.dev?utm-source=stack-comment";>https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="11px" height="11px"/> Graphite
  

https://github.com/llvm/llvm-project/pull/97152
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)

2024-06-29 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

Previously this would introduce some codegen regressions, but
those have been avoided by simplifying demanded bits on copysign
operations.

---
Full diff: https://github.com/llvm/llvm-project/pull/97152.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+4-3) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll (+23-29) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll (+26-33) 
- (modified) llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll (+15-15) 


```diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 456f3cb332cf8..27fa67ce5b45a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, 
IRBuilder<> &B,
   if (needcopysign) {
 Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
 Type *nTy = FPOp->getType()->getWithNewType(nTyS);
-unsigned size = nTy->getScalarSizeInBits();
 Value *opr_n = FPOp->getOperand(1);
 if (opr_n->getType()->getScalarType()->isIntegerTy())
   opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
 else
   opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
 
+unsigned size = nTy->getScalarSizeInBits();
 Value *sign = B.CreateShl(opr_n, size-1, "__yeven");
 sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
-nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
-nval = B.CreateBitCast(nval, opr0->getType());
+
+nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()),
+nullptr, "__pow_sign");
   }
 
   LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
index 6b4b0f881f3be..ab52c8ff8d399 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll
@@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) {
 define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) {
 ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison
 ; CHECK-SAME: (<2 x float> [[X:%.*]]) {
-; CHECK-NEXT:ret <2 x float> poison
+; CHECK-NEXT:[[__EXP2:%.*]] = call nnan ninf afn <2 x float> 
@llvm.exp2.v2f32(<2 x float> poison)
+; CHECK-NEXT:ret <2 x float> [[__EXP2]]
 ;
   %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 
x float> poison)
   ret <2 x float> %pow
@@ -2215,10 +2216,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:ret float [[TMP5]]
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float 
@llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:ret float [[__POW_SIGN1]]
 ;
   %y.cast = sitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2303,10 +2303,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; CHECK-NEXT:ret float [[TMP5]]
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float
+; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float 
@llvm.copysign.f32(float [[__EXP2]], float [[TMP3]])
+; CHECK-NEXT:ret float [[__POW_SIGN1]]
 ;
   %y.cast = uitofp i32 %y to float
   %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2352,10 +2351,9 @@ define float 
@test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31
 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32
 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]]
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32
-; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]]
-; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float
-; C

[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/97152
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm created 
https://github.com/llvm/llvm-project/pull/97180

So far the only cases that seem to benefit are the weird
copysign with different typed inputs.

>From a064f83bf3579016ec5f0af50a11f2b2d3a31cf4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault 
Date: Sat, 29 Jun 2024 09:36:18 +0200
Subject: [PATCH] DAG: Call SimplifyDemandedBits on copysign value operand

So far the only cases that seem to benefit are the weird
copysign with different typed inputs.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +++
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 40 +++
 2 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eaab2b3421190..5d72b5e1e30dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17571,6 +17571,11 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
APInt::getSignMask(SignVT.getScalarSizeInBits(
 return SDValue(N, 0);
 
+  // We only take the non-sign bits from the value operand
+  if (SimplifyDemandedBits(N0,
+   APInt::getSignedMaxValue(VT.getScalarSizeInBits(
+return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b8936911f0576..eda1709e4fd59 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1639,10 +1639,7 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; VI-NEXT:v_or_b32_e32 v2, 0x7c00, v2
 ; VI-NEXT:v_mov_b32_e32 v3, s2
 ; VI-NEXT:s_cselect_b64 vcc, -1, 0
-; VI-NEXT:s_lshr_b32 s0, s7, 16
 ; VI-NEXT:v_cndmask_b32_e32 v2, v3, v2, vcc
-; VI-NEXT:s_and_b32 s0, s0, 0x8000
-; VI-NEXT:v_or_b32_e32 v2, s0, v2
 ; VI-NEXT:s_movk_i32 s0, 0x7fff
 ; VI-NEXT:v_mov_b32_e32 v3, s8
 ; VI-NEXT:v_bfi_b32 v2, s0, v2, v3
@@ -1673,36 +1670,33 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX9-NEXT:v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:v_cmp_ne_u32_e32 vcc, s0, v1
 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:s_add_i32 s9, s1, 0xfc10
+; GFX9-NEXT:s_add_i32 s7, s1, 0xfc10
 ; GFX9-NEXT:v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:s_lshl_b32 s1, s9, 12
+; GFX9-NEXT:s_lshl_b32 s1, s7, 12
 ; GFX9-NEXT:s_or_b32 s0, s2, s0
 ; GFX9-NEXT:s_or_b32 s1, s6, s1
-; GFX9-NEXT:s_cmp_lt_i32 s9, 1
-; GFX9-NEXT:s_cselect_b32 s10, s0, s1
-; GFX9-NEXT:s_and_b32 s2, s10, 7
+; GFX9-NEXT:s_cmp_lt_i32 s7, 1
+; GFX9-NEXT:s_cselect_b32 s9, s0, s1
+; GFX9-NEXT:s_and_b32 s2, s9, 7
 ; GFX9-NEXT:s_cmp_gt_i32 s2, 5
 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:s_cmp_eq_u32 s2, 3
 ; GFX9-NEXT:s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT:s_lshr_b32 s2, s10, 2
+; GFX9-NEXT:s_lshr_b32 s2, s9, 2
 ; GFX9-NEXT:s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:s_addc_u32 s0, s2, 0
-; GFX9-NEXT:s_cmp_lt_i32 s9, 31
+; GFX9-NEXT:s_cmp_lt_i32 s7, 31
 ; GFX9-NEXT:s_cselect_b32 s2, s0, 0x7c00
 ; GFX9-NEXT:s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:v_lshlrev_b32_e32 v1, 9, v1
-; GFX9-NEXT:s_cmpk_eq_i32 s9, 0x40f
+; GFX9-NEXT:s_cmpk_eq_i32 s7, 0x40f
 ; GFX9-NEXT:v_or_b32_e32 v1, 0x7c00, v1
 ; GFX9-NEXT:v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:s_lshr_b32 s0, s7, 16
 ; GFX9-NEXT:v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:s_and_b32 s0, s0, 0x8000
-; GFX9-NEXT:v_or_b32_e32 v1, s0, v1
 ; GFX9-NEXT:s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:v_bfi_b32 v1, s0, v1, v2
@@ -1728,13 +1722,13 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:s_addk_i32 s1, 0xfc10
 ; GFX11-NEXT:v_med3_i32 v1, s3, 0, 13
 ; GFX11-NEXT:v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:s_lshl_b32 s8, s1, 12
+; GFX11-NEXT:s_lshl_b32 s7, s1, 12
 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
 ; GFX11-NEXT:v_readfirstlane_b32 s6, v1
 ; GFX11-NEXT:s_or_b32 s2, s2, s3
 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1)
 ; GFX11-NEXT:s_or_b32 s3, s2, 0x1000
-; GFX11-NEXT:s_or_b32 s8, s2, s8
+; GFX11-NEXT:s_or_b32 s7, s2, s7
 ; GFX11-NEXT:s_lshr_b32 s6, s3, s6
 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2)
 ; GFX11-NEXT:v_lshlrev_b32_e64 v0, v1, s6
@@ -1745,15 +1739,15 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT: 

[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is
> open. Once all requirements are satisfied, merge this PR as a stack on
> Graphite: https://app.graphite.dev/github/pr/llvm/llvm-project/97180
> Learn more: https://graphite.dev/docs/merge-pull-requests

* **#97180** https://app.graphite.dev/github/pr/llvm/llvm-project/97180?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/> 👈
* **#97151** https://app.graphite.dev/github/pr/llvm/llvm-project/97151?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>: 1 other dependent PR 
([#97152](https://github.com/llvm/llvm-project/pull/97152) https://app.graphite.dev/github/pr/llvm/llvm-project/97152?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>)
* **#97150** https://app.graphite.dev/github/pr/llvm/llvm-project/97150?utm_source=stack-comment-icon";
 target="_blank">https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="10px" height="10px"/>
* `main`

This stack of pull requests is managed by Graphite. https://stacking.dev/?utm_source=stack-comment";>Learn more about 
stacking.


 Join @arsenm and the rest of your teammates on https://graphite.dev?utm-source=stack-comment";>https://static.graphite.dev/graphite-32x32-black.png"; alt="Graphite" 
width="11px" height="11px"/> Graphite
  

https://github.com/llvm/llvm-project/pull/97180
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)

2024-06-29 Thread Matt Arsenault via llvm-branch-commits

https://github.com/arsenm ready_for_review 
https://github.com/llvm/llvm-project/pull/97180
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits


[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)

2024-06-29 Thread via llvm-branch-commits

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Changes

So far the only cases that seem to benefit are the weird
copysign with different typed inputs.

---
Full diff: https://github.com/llvm/llvm-project/pull/97180.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+5) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+15-25) 


```diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index eaab2b3421190..5d72b5e1e30dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17571,6 +17571,11 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
APInt::getSignMask(SignVT.getScalarSizeInBits(
 return SDValue(N, 0);
 
+  // We only take the non-sign bits from the value operand
+  if (SimplifyDemandedBits(N0,
+   APInt::getSignedMaxValue(VT.getScalarSizeInBits(
+return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b8936911f0576..eda1709e4fd59 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1639,10 +1639,7 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; VI-NEXT:v_or_b32_e32 v2, 0x7c00, v2
 ; VI-NEXT:v_mov_b32_e32 v3, s2
 ; VI-NEXT:s_cselect_b64 vcc, -1, 0
-; VI-NEXT:s_lshr_b32 s0, s7, 16
 ; VI-NEXT:v_cndmask_b32_e32 v2, v3, v2, vcc
-; VI-NEXT:s_and_b32 s0, s0, 0x8000
-; VI-NEXT:v_or_b32_e32 v2, s0, v2
 ; VI-NEXT:s_movk_i32 s0, 0x7fff
 ; VI-NEXT:v_mov_b32_e32 v3, s8
 ; VI-NEXT:v_bfi_b32 v2, s0, v2, v3
@@ -1673,36 +1670,33 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX9-NEXT:v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:v_cmp_ne_u32_e32 vcc, s0, v1
 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT:s_add_i32 s9, s1, 0xfc10
+; GFX9-NEXT:s_add_i32 s7, s1, 0xfc10
 ; GFX9-NEXT:v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:s_lshl_b32 s1, s9, 12
+; GFX9-NEXT:s_lshl_b32 s1, s7, 12
 ; GFX9-NEXT:s_or_b32 s0, s2, s0
 ; GFX9-NEXT:s_or_b32 s1, s6, s1
-; GFX9-NEXT:s_cmp_lt_i32 s9, 1
-; GFX9-NEXT:s_cselect_b32 s10, s0, s1
-; GFX9-NEXT:s_and_b32 s2, s10, 7
+; GFX9-NEXT:s_cmp_lt_i32 s7, 1
+; GFX9-NEXT:s_cselect_b32 s9, s0, s1
+; GFX9-NEXT:s_and_b32 s2, s9, 7
 ; GFX9-NEXT:s_cmp_gt_i32 s2, 5
 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:s_cmp_eq_u32 s2, 3
 ; GFX9-NEXT:s_cselect_b64 s[2:3], -1, 0
 ; GFX9-NEXT:s_or_b64 s[0:1], s[2:3], s[0:1]
-; GFX9-NEXT:s_lshr_b32 s2, s10, 2
+; GFX9-NEXT:s_lshr_b32 s2, s9, 2
 ; GFX9-NEXT:s_cmp_lg_u64 s[0:1], 0
 ; GFX9-NEXT:s_addc_u32 s0, s2, 0
-; GFX9-NEXT:s_cmp_lt_i32 s9, 31
+; GFX9-NEXT:s_cmp_lt_i32 s7, 31
 ; GFX9-NEXT:s_cselect_b32 s2, s0, 0x7c00
 ; GFX9-NEXT:s_cmp_lg_u32 s6, 0
 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0
 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, s[0:1]
 ; GFX9-NEXT:v_lshlrev_b32_e32 v1, 9, v1
-; GFX9-NEXT:s_cmpk_eq_i32 s9, 0x40f
+; GFX9-NEXT:s_cmpk_eq_i32 s7, 0x40f
 ; GFX9-NEXT:v_or_b32_e32 v1, 0x7c00, v1
 ; GFX9-NEXT:v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:s_lshr_b32 s0, s7, 16
 ; GFX9-NEXT:v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:s_and_b32 s0, s0, 0x8000
-; GFX9-NEXT:v_or_b32_e32 v1, s0, v1
 ; GFX9-NEXT:s_movk_i32 s0, 0x7fff
 ; GFX9-NEXT:v_mov_b32_e32 v2, s8
 ; GFX9-NEXT:v_bfi_b32 v1, s0, v1, v2
@@ -1728,13 +1722,13 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:s_addk_i32 s1, 0xfc10
 ; GFX11-NEXT:v_med3_i32 v1, s3, 0, 13
 ; GFX11-NEXT:v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:s_lshl_b32 s8, s1, 12
+; GFX11-NEXT:s_lshl_b32 s7, s1, 12
 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2)
 ; GFX11-NEXT:v_readfirstlane_b32 s6, v1
 ; GFX11-NEXT:s_or_b32 s2, s2, s3
 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1)
 ; GFX11-NEXT:s_or_b32 s3, s2, 0x1000
-; GFX11-NEXT:s_or_b32 s8, s2, s8
+; GFX11-NEXT:s_or_b32 s7, s2, s7
 ; GFX11-NEXT:s_lshr_b32 s6, s3, s6
 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2)
 ; GFX11-NEXT:v_lshlrev_b32_e64 v0, v1, s6
@@ -1745,15 +1739,15 @@ define amdgpu_kernel void 
@s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
 ; GFX11-NEXT:v_readfirstlane_b32 s3, v0
 ; GFX11-NEXT:s_or_b32 s3, s6, s3
 ; GFX11-NEXT:s_cmp_lt_i32 s1, 1
-; GFX11-NEXT:s_cselect_b32 s3, s3, s8
+; GFX11-NEXT:s_cselect_b32 s3, s3, s7
 ; GFX11-NEXT:s_delay_alu instid0(SALU_CY