[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/97151 Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately. >From 1cc5dcac0c7c83adaef02ad2d092764de1922e4b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 29 Jun 2024 07:42:27 +0200 Subject: [PATCH] DAG: Call SimplifyDemandedBits on fcopysign sign value Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++ .../AMDGPU/copysign-simplify-demanded-bits.ll | 22 +++- .../PowerPC/fp128-bitcast-after-operation.ll | 55 +-- llvm/test/CodeGen/RISCV/double-arith.ll | 4 +- .../RISCV/double-bitmanip-dagcombines.ll | 12 ++-- .../RISCV/float-bitmanip-dagcombines.ll | 10 ++-- 6 files changed, 47 insertions(+), 62 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 254d63abdf805..eaab2b3421190 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17565,6 +17565,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); + // We only take the sign bit from the sign operand. 
+ EVT SignVT = N1.getValueType(); + if (SimplifyDemandedBits(N1, + APInt::getSignMask(SignVT.getScalarSizeInBits()))) + return SDValue(N, 0); + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 1eccb55202f02..af4f236c783c6 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -11,11 +11,10 @@ define half @test_pown_reduced_fast_f16_known_odd(half %x, i32 %y.arg) #0 { ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0 ; GFX9-NEXT:s_movk_i32 s4, 0x7fff ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT:v_mul_f16_e64 v0, |v0|, v1 -; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2 +; GFX9-NEXT:v_mul_f16_e64 v1, |v0|, v1 +; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0 ; GFX9-NEXT:s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %fabs = call half @llvm.fabs.f16(half %x) @@ -37,10 +36,9 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3 ; GFX9-NEXT:v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT:v_and_b32_e32 v3, 0x7fff7fff, v0 -; GFX9-NEXT:v_and_b32_e32 v0, 0x80008000, v0 +; GFX9-NEXT:s_movk_i32 s4, 0x7fff ; GFX9-NEXT:v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT:v_cvt_f16_f32_e32 v1, v1 -; GFX9-NEXT:s_movk_i32 s4, 0x7fff ; GFX9-NEXT:v_pack_b32_f16 v1, v1, v2 ; GFX9-NEXT:v_pk_mul_f16 v1, v3, v1 ; GFX9-NEXT:v_bfi_b32 v2, s4, v1, v0 @@ -67,10 +65,9 @@ define float @test_pown_reduced_fast_f32_known_odd(float %x, i32 %y.arg) #0 { ; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT:v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT:v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT:v_and_b32_e32 v2, 0x8000, v0 ; GFX9-NEXT:s_brev_b32 s4, -2 -; GFX9-NEXT:v_mul_f32_e64 v0, |v0|, v1 -; GFX9-NEXT:v_bfi_b32 v0, s4, v0, v2 +; GFX9-NEXT:v_mul_f32_e64 v1, |v0|, v1 +; GFX9-NEXT:v_bfi_b32 v0, s4, v1, v0 ; GFX9-NEXT:s_setpc_b64 s[30:31] %y = or i32 %y.arg, 1 %fabs = call float @llvm.fabs.f32(float %x) @@ -94,8 +91,6 @@ define <2 x float> @test_pown_reduced_fast_v2f32_known_odd(<2 x float> %x, <2 x ; GFX9-NEXT:s_brev_b32 s4, -2 ; GFX9-NEXT:v_mul_f32_e64 v3, |v1|, v3 ; GFX9-NEXT:v_mul_f32_e64 v2, |v0|, v2 -; GFX9-NEXT:v_and_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT:v_and_b32_e32 v0, 0x8000, v0 ; GFX9-NEXT:v_bfi_b32 v0, s4, v2, v0 ; GFX9-NEXT:v_bfi_b32 v1, s4, v3, v1 ; GFX9-NEXT:s_setpc_b64 s[30:31] @@ -118,8 +113,7 @@ define double @test_pown_reduced_fast_f64_known_odd(double %x, i32 %y.arg) #0 { ; GFX9-NEXT:v_cvt_f64_i32_e32 v[2:3], v2 ;
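For intuition about the new combine (a standalone illustration, not part of the patch): fcopysign reads only the sign bit of its sign operand, so everything feeding that operand can be simplified against a one-bit demanded mask. The sketch below checks that this mask and the complementary non-sign mask used later in this stack (#97180) partition a float's bits exactly:

```cpp
// Sanity-check sketch for the demanded-bits masks; assumes an IEEE layout
// with the sign in the top bit, which is the caveat raised above.
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  using llvm::APInt;
  const unsigned Bits = 32; // scalar size of f32; f16/f64 behave the same

  APInt SignMask = APInt::getSignMask(Bits);      // 0x80000000, sign operand
  APInt MagMask = APInt::getSignedMaxValue(Bits); // 0x7fffffff, value operand

  assert(SignMask == APInt(Bits, 0x80000000u));
  assert((SignMask & MagMask) == 0);        // disjoint ...
  assert((SignMask | MagMask).isAllOnes()); // ... and exhaustive
  return 0;
}
```

This is also why the v_and_b32 with 0x8000 disappears from the AMDGPU tests above: v_bfi_b32 with a 0x7fff-style mask already takes only the sign bit from its third operand, so the explicit sign-bit extraction is dead.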
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)
arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite ([learn more](https://graphite.dev/docs/merge-pull-requests)).

* **#97151** 👈
* **#97150**
* `main`

This stack of pull requests is managed by Graphite ([learn more about stacking](https://stacking.dev/)).

https://github.com/llvm/llvm-project/pull/97151
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/97151 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on fcopysign sign value (PR #97151)
llvmbot wrote: @llvm/pr-subscribers-llvm-selectiondag

Author: Matt Arsenault (arsenm)

Changes

Math library code has quite a few places with complex bit logic that are ultimately fed into a copysign. This helps avoid some regressions in a future patch. This assumes the position in the float type, which should at least be valid for IEEE types. Not sure if we need to guard against ppc_fp128 or anything else weird. There appears to be some value in simplifying the value operand as well, but I'll address that separately.

---

Full diff: https://github.com/llvm/llvm-project/pull/97151.diff

6 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+6)
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+7-15)
- (modified) llvm/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll (+25-30)
- (modified) llvm/test/CodeGen/RISCV/double-arith.ll (+1-3)
- (modified) llvm/test/CodeGen/RISCV/double-bitmanip-dagcombines.ll (+4-8)
- (modified) llvm/test/CodeGen/RISCV/float-bitmanip-dagcombines.ll (+4-6)
[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/97152 Previously this would introduce some codegen regressions, but those have been avoided by simplifying demanded bits on copysign operations. >From 339a76086df9e50218561f568d70683f14ef1631 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 26 Aug 2023 13:19:06 -0400 Subject: [PATCH] AMDGPU: Use real copysign in fast pow Previously this would introduce some codegen regressions, but those have been avoided by simplifying demanded bits on copysign operations. --- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 7 ++- .../AMDGPU/amdgpu-simplify-libcall-pow.ll | 52 .../AMDGPU/amdgpu-simplify-libcall-pown.ll| 59 --- llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 30 +- 4 files changed, 68 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 456f3cb332cf8..27fa67ce5b45a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1131,17 +1131,18 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needcopysign) { Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits()); Type *nTy = FPOp->getType()->getWithNewType(nTyS); -unsigned size = nTy->getScalarSizeInBits(); Value *opr_n = FPOp->getOperand(1); if (opr_n->getType()->getScalarType()->isIntegerTy()) opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou"); else opr_n = B.CreateFPToSI(opr1, nTy, "__ytou"); +unsigned size = nTy->getScalarSizeInBits(); Value *sign = B.CreateShl(opr_n, size-1, "__yeven"); sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign"); -nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign); -nval = B.CreateBitCast(nval, opr0->getType()); + +nval = B.CreateCopySign(nval, B.CreateBitCast(sign, nval->getType()), +nullptr, "__pow_sign"); } LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll index 6b4b0f881f3be..ab52c8ff8d399 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll @@ -1783,7 +1783,8 @@ define float @test_pow_afn_f32_nnan_ninf__y_10(float %x) { define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison(<2 x float> %x) { ; CHECK-LABEL: define <2 x float> @test_pow_afn_v2f32_nnan_ninf__y_poison ; CHECK-SAME: (<2 x float> [[X:%.*]]) { -; CHECK-NEXT:ret <2 x float> poison +; CHECK-NEXT:[[__EXP2:%.*]] = call nnan ninf afn <2 x float> @llvm.exp2.v2f32(<2 x float> poison) +; CHECK-NEXT:ret <2 x float> [[__EXP2]] ; %pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> poison) ret <2 x float> %pow @@ -2215,10 +2216,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y) ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]] -; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32 -; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]] -; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float -; CHECK-NEXT:ret float [[TMP5]] +; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float +; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]]) +; CHECK-NEXT:ret float [[__POW_SIGN1]] ; %y.cast = sitofp i32 %y to float %pow = tail call afn 
nnan ninf float @_Z3powff(float %x, float %y.cast) @@ -2303,10 +2303,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y) ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP2]] -; CHECK-NEXT:[[TMP3:%.*]] = bitcast float [[__EXP2]] to i32 -; CHECK-NEXT:[[TMP4:%.*]] = or disjoint i32 [[__POW_SIGN]], [[TMP3]] -; CHECK-NEXT:[[TMP5:%.*]] = bitcast i32 [[TMP4]] to float -; CHECK-NEXT:ret float [[TMP5]] +; CHECK-NEXT:[[TMP3:%.*]] = bitcast i32 [[__POW_SIGN]] to float +; CHECK-NEXT:[[__POW_SIGN1:%.*]] = call nnan ninf afn float @llvm.copysign.f32(float [[__EXP2]], float [[TMP3]]) +; CHECK-NEXT:ret float [[__POW_SIGN1]] ; %y.cast = uitofp i32 %y to float %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast) @@ -2352,10 +2351,9 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2 ; CHECK-NEXT:[[__YEVEN:%.*]] = shl i32 [[TMP1]], 31 ; CHECK-NEXT:[[TMP2:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT:[[__POW_SIGN:%.*]] = and
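To make the new lowering concrete, here is a minimal IRBuilder sketch of the rewritten sign handling (scalar-only; the function and variable names are mine, while the real code lives in AMDGPULibCalls::fold_pow and also handles the vector and integer-y forms shown in the diff):

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// X: the pow base; YInt: y as an integer of X's bit width;
// Mag: the already-computed magnitude exp2(y * log2(|x|)).
// For integer y, x^y is negative iff x is negative and y is odd.
static Value *applyPowSign(IRBuilder<> &B, Value *X, Value *YInt, Value *Mag) {
  Type *IntTy = B.getIntNTy(X->getType()->getPrimitiveSizeInBits());
  // Shift y's parity bit into the sign position: sign bit set iff y is odd.
  Value *YOdd = B.CreateShl(YInt, IntTy->getIntegerBitWidth() - 1, "__yeven");
  // Keep X's sign bit only when y is odd; every other bit is zero.
  Value *Sign = B.CreateAnd(B.CreateBitCast(X, IntTy), YOdd, "__pow_sign");
  // The change: emit a real copysign rather than the old or+bitcast.
  return B.CreateCopySign(Mag, B.CreateBitCast(Sign, X->getType()), nullptr,
                          "__pow_sign");
}
```

The substantive change is the final call: emitting a real @llvm.copysign instead of the previous or+bitcast sequence. That only became profitable once the demanded-bits combine from #97151 let the backend fold away the bit logic feeding the sign operand.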
[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)
arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite ([learn more](https://graphite.dev/docs/merge-pull-requests)).

* **#97152** 👈
* **#97151**
* **#97150**
* `main`

This stack of pull requests is managed by Graphite ([learn more about stacking](https://stacking.dev/)).

https://github.com/llvm/llvm-project/pull/97152
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Previously this would introduce some codegen regressions, but those have been avoided by simplifying demanded bits on copysign operations.

---

Full diff: https://github.com/llvm/llvm-project/pull/97152.diff

4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow.ll (+23-29)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll (+26-33)
- (modified) llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll (+15-15)
[llvm-branch-commits] [llvm] AMDGPU: Use real copysign in fast pow (PR #97152)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/97152 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/97180 So far the only cases that seem to benefit are the weird copysign with different typed inputs. >From a064f83bf3579016ec5f0af50a11f2b2d3a31cf4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 29 Jun 2024 09:36:18 +0200 Subject: [PATCH] DAG: Call SimplifyDemandedBits on copysign value operand So far the only cases that seem to benefit are the weird copysign with different typed inputs. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++ llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 40 +++ 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index eaab2b3421190..5d72b5e1e30dd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17571,6 +17571,11 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { APInt::getSignMask(SignVT.getScalarSizeInBits()))) return SDValue(N, 0); + // We only take the non-sign bits from the value operand + if (SimplifyDemandedBits(N0, + APInt::getSignedMaxValue(VT.getScalarSizeInBits()))) + return SDValue(N, 0); + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b8936911f0576..eda1709e4fd59 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1639,10 +1639,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; VI-NEXT:v_or_b32_e32 v2, 0x7c00, v2 ; VI-NEXT:v_mov_b32_e32 v3, s2 ; VI-NEXT:s_cselect_b64 vcc, -1, 0 -; VI-NEXT:s_lshr_b32 s0, s7, 16 ; VI-NEXT:v_cndmask_b32_e32 v2, v3, v2, vcc -; VI-NEXT:s_and_b32 s0, s0, 0x8000 -; VI-NEXT:v_or_b32_e32 v2, s0, v2 ; VI-NEXT:s_movk_i32 s0, 0x7fff ; VI-NEXT:v_mov_b32_e32 v3, s8 ; VI-NEXT:v_bfi_b32 v2, s0, v2, v3 @@ -1673,36 +1670,33 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT:v_lshlrev_b32_e64 v1, v1, s2 ; GFX9-NEXT:v_cmp_ne_u32_e32 vcc, s0, v1 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-NEXT:s_add_i32 s9, s1, 0xfc10 +; GFX9-NEXT:s_add_i32 s7, s1, 0xfc10 ; GFX9-NEXT:v_readfirstlane_b32 s0, v1 -; GFX9-NEXT:s_lshl_b32 s1, s9, 12 +; GFX9-NEXT:s_lshl_b32 s1, s7, 12 ; GFX9-NEXT:s_or_b32 s0, s2, s0 ; GFX9-NEXT:s_or_b32 s1, s6, s1 -; GFX9-NEXT:s_cmp_lt_i32 s9, 1 -; GFX9-NEXT:s_cselect_b32 s10, s0, s1 -; GFX9-NEXT:s_and_b32 s2, s10, 7 +; GFX9-NEXT:s_cmp_lt_i32 s7, 1 +; GFX9-NEXT:s_cselect_b32 s9, s0, s1 +; GFX9-NEXT:s_and_b32 s2, s9, 7 ; GFX9-NEXT:s_cmp_gt_i32 s2, 5 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT:s_cmp_eq_u32 s2, 3 ; GFX9-NEXT:s_cselect_b64 s[2:3], -1, 0 ; GFX9-NEXT:s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT:s_lshr_b32 s2, s10, 2 +; GFX9-NEXT:s_lshr_b32 s2, s9, 2 ; GFX9-NEXT:s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT:s_addc_u32 s0, s2, 0 -; GFX9-NEXT:s_cmp_lt_i32 s9, 31 +; GFX9-NEXT:s_cmp_lt_i32 s7, 31 ; GFX9-NEXT:s_cselect_b32 s2, s0, 0x7c00 ; GFX9-NEXT:s_cmp_lg_u32 s6, 0 ; GFX9-NEXT:s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT:v_cndmask_b32_e64 v1, 0, 1, s[0:1] ; GFX9-NEXT:v_lshlrev_b32_e32 v1, 9, v1 -; GFX9-NEXT:s_cmpk_eq_i32 s9, 0x40f +; GFX9-NEXT:s_cmpk_eq_i32 s7, 0x40f ; GFX9-NEXT:v_or_b32_e32 v1, 0x7c00, v1 ; GFX9-NEXT:v_mov_b32_e32 v2, s2 ; GFX9-NEXT:s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT:s_lshr_b32 s0, s7, 16 ; GFX9-NEXT:v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT:s_and_b32 s0, s0, 0x8000 -; GFX9-NEXT:v_or_b32_e32 v1, s0, v1 ; GFX9-NEXT:s_movk_i32 s0, 
0x7fff ; GFX9-NEXT:v_mov_b32_e32 v2, s8 ; GFX9-NEXT:v_bfi_b32 v1, s0, v1, v2 @@ -1728,13 +1722,13 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT:s_addk_i32 s1, 0xfc10 ; GFX11-NEXT:v_med3_i32 v1, s3, 0, 13 ; GFX11-NEXT:v_readfirstlane_b32 s3, v0 -; GFX11-NEXT:s_lshl_b32 s8, s1, 12 +; GFX11-NEXT:s_lshl_b32 s7, s1, 12 ; GFX11-NEXT:s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT:v_readfirstlane_b32 s6, v1 ; GFX11-NEXT:s_or_b32 s2, s2, s3 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT:s_or_b32 s3, s2, 0x1000 -; GFX11-NEXT:s_or_b32 s8, s2, s8 +; GFX11-NEXT:s_or_b32 s7, s2, s7 ; GFX11-NEXT:s_lshr_b32 s6, s3, s6 ; GFX11-NEXT:s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT:v_lshlrev_b32_e64 v0, v1, s6 @@ -1745,15 +1739,15 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX11-NEXT:
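Taken together with #97151, the tail of DAGCombiner::visitFCOPYSIGN after this stack should read roughly as follows (a sketch reconstructed from the two flattened diffs, with line breaks and the truncated closing parentheses restored; N0 is the value operand, N1 the sign operand):

```cpp
// Sketch of the end of SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N),
// after the pre-existing combines:

  // We only take the sign bit from the sign operand (#97151).
  EVT SignVT = N1.getValueType();
  if (SimplifyDemandedBits(N1,
                           APInt::getSignMask(SignVT.getScalarSizeInBits())))
    return SDValue(N, 0);

  // We only take the non-sign bits from the value operand (#97180).
  if (SimplifyDemandedBits(N0,
                           APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
    return SDValue(N, 0);

  return SDValue();
```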
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)
arsenm wrote:

> [!WARNING]
> This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite ([learn more](https://graphite.dev/docs/merge-pull-requests)).

* **#97180** 👈
* **#97151**: 1 other dependent PR ([#97152](https://github.com/llvm/llvm-project/pull/97152))
* **#97150**
* `main`

This stack of pull requests is managed by Graphite ([learn more about stacking](https://stacking.dev/)).

https://github.com/llvm/llvm-project/pull/97180
___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)
https://github.com/arsenm ready_for_review https://github.com/llvm/llvm-project/pull/97180 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] DAG: Call SimplifyDemandedBits on copysign value operand (PR #97180)
llvmbot wrote: @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

So far the only cases that seem to benefit are the weird copysign with different typed inputs.

---

Full diff: https://github.com/llvm/llvm-project/pull/97180.diff

2 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+5)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+15-25)