[llvm-branch-commits] [llvm] 92be640 - [FPEnv][AMDGPU] Disable FSUB(-0, X)->FNEG(X) DAGCombine when subnormals are flushed
Author: Cameron McInally
Date: 2021-01-04T14:44:10-06:00
New Revision: 92be640bd7d4fbc8e032a0aa81381a0246efa0be

URL: https://github.com/llvm/llvm-project/commit/92be640bd7d4fbc8e032a0aa81381a0246efa0be
DIFF: https://github.com/llvm/llvm-project/commit/92be640bd7d4fbc8e032a0aa81381a0246efa0be.diff

LOG: [FPEnv][AMDGPU] Disable FSUB(-0,X)->FNEG(X) DAGCombine when subnormals are flushed

This patch disables the FSUB(-0,X)->FNEG(X) DAG combine when we're flushing
subnormals. It requires updating the existing AMDGPU tests to use the fneg IR
instruction, in place of the old fsub(-0,X) canonical form, since AMDGPU is
the only backend currently checking the DenormalMode flags.

Note that this will require follow-up optimizations to make sure the
FSUB(-0,X) form is handled appropriately

Differential Revision: https://reviews.llvm.org/D93243

Added:


Modified:
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
    llvm/test/CodeGen/AMDGPU/clamp.ll
    llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
    llvm/test/CodeGen/AMDGPU/fma-combine.ll
    llvm/test/CodeGen/AMDGPU/fneg-combines.ll
    llvm/test/CodeGen/AMDGPU/fpext-free.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/known-never-snan.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
    llvm/test/CodeGen/AMDGPU/mad-combine.ll
    llvm/test/CodeGen/AMDGPU/mad-mix.ll
    llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
    llvm/test/CodeGen/AMDGPU/rsq.ll
    llvm/test/CodeGen/AMDGPU/v_mac.ll
    llvm/test/CodeGen/AMDGPU/v_mac_f16.ll

Removed:


diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 92b23df9e3af..6b1bd721a993 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13367,18 +13367,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   }
 
   // (fsub -0.0, N1) -> -N1
-  // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
-  // FSUB does not specify the sign bit of a NaN. Also note that for
-  // the same reason, the inverse transform is not safe, unless fast math
-  // flags are in play.
   if (N0CFP && N0CFP->isZero()) {
     if (N0CFP->isNegative() ||
         (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
-      if (SDValue NegN1 =
-              TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
-        return NegN1;
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-        return DAG.getNode(ISD::FNEG, DL, VT, N1);
+      // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
+      // flushed to zero, unless all users treat denorms as zero (DAZ).
+      // FIXME: This transform will change the sign of a NaN and the behavior
+      // of a signaling NaN. It is only valid when a NoNaN flag is present.
+      DenormalMode DenormMode = DAG.getDenormalMode(VT);
+      if (DenormMode == DenormalMode::getIEEE()) {
+        if (SDValue NegN1 =
+                TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+          return NegN1;
+        if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+          return DAG.getNode(ISD::FNEG, DL, VT, N1);
+      }
     }
   }
 

diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 5a56a1a264af..4f3d6442da44 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -62,7 +62,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, flo
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
   %floor = call float @llvm.floor.f32(float %a)
-  %neg.floor = fsub float -0.0, %floor
+  %neg.floor = fneg float %floor
   %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
   store float %clamp, float addrspace(1)* %out.gep

diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 1e18b2fa1c1b..256bea7fb7fb 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrs
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
-  %fneg.a = fsub float -0.0, %a
+  %fneg.a = fneg float %a
   %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 
@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float ad
   %out.gep = getelementptr float, float addrspace(1)*
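For context, the equivalence that the new DenormalMode check guards against can be
sketched in plain LLVM IR. The two functions below are illustrative only and do not
appear in the patch: when a target flushes subnormal inputs to zero, the fsub form
effectively subtracts a signed zero and can return -0.0 for a subnormal %x, while
fneg is a pure sign-bit flip that keeps the (negated) subnormal, so the two forms
are no longer interchangeable.

; Illustrative sketch, not part of the commit.
define float @neg_via_fsub(float %x) {
  ; With denormal inputs flushed (DAZ), a positive subnormal %x participates
  ; as +0.0 here, so the result is -0.0 - (+0.0) = -0.0.
  %r = fsub float -0.0, %x
  ret float %r
}

define float @neg_via_fneg(float %x) {
  ; fneg only flips the sign bit; a subnormal %x stays a subnormal with its
  ; sign inverted.
  %r = fneg float %x
  ret float %r
}

This is why the combine is now restricted to DenormalMode::getIEEE(), and why the
updated tests express negation with fneg rather than the fsub(-0.0, x) idiom.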
[llvm-branch-commits] [llvm] f401335 - [SVE] Add unpacked scalable floating point ZIP/UZP/TRN patterns
Author: Cameron McInally
Date: 2021-01-07T09:56:53-06:00
New Revision: f4013359b3da2c78e94a64245de8638460f96c1a

URL: https://github.com/llvm/llvm-project/commit/f4013359b3da2c78e94a64245de8638460f96c1a
DIFF: https://github.com/llvm/llvm-project/commit/f4013359b3da2c78e94a64245de8638460f96c1a.diff

LOG: [SVE] Add unpacked scalable floating point ZIP/UZP/TRN patterns

Differential Revision: https://reviews.llvm.org/D94193

Added:


Modified:
    llvm/lib/Target/AArch64/SVEInstrFormats.td
    llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll

Removed:


diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1020a81a3494..b4416135eeb5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2270,6 +2270,8 @@ multiclass sve_int_perm_bin_perm_zz opc, string asm,
   def : SVE_2_Op_Pat(NAME # _H)>;
   def : SVE_2_Op_Pat(NAME # _S)>;
   def : SVE_2_Op_Pat(NAME # _S)>;
+  def : SVE_2_Op_Pat(NAME # _D)>;
+  def : SVE_2_Op_Pat(NAME # _D)>;
   def : SVE_2_Op_Pat(NAME # _D)>;
 
   def : SVE_2_Op_Pat(NAME # _H)>;

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index b248d209f44a..9f0d71e6a7b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -1277,6 +1277,15 @@ define @trn1_i64( %a, %b
   ret %out
 }
 
+define @trn1_f16_v2( %a, %b) {
+; CHECK-LABEL: trn1_f16_v2:
+; CHECK: trn1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.trn1.nxv2f16( %a,
+                                              %b)
+  ret %out
+}
+
 define @trn1_f16_v4( %a, %b) {
 ; CHECK-LABEL: trn1_f16_v4:
 ; CHECK: trn1 z0.s, z0.s, z1.s
@@ -1304,6 +1313,15 @@ define @trn1_f16( %a,
   ret %out
 }
 
+define @trn1_f32_v2( %a, %b) {
+; CHECK-LABEL: trn1_f32_v2:
+; CHECK: trn1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.trn1.nxv2f32( %a,
+                                              %b)
+  ret %out
+}
+
 define @trn1_f32( %a, %b) {
 ; CHECK-LABEL: trn1_f32:
 ; CHECK: trn1 z0.s, z0.s, z1.s
@@ -1398,6 +1416,15 @@ define @trn2_i64( %a, %b
   ret %out
 }
 
+define @trn2_f16_v2( %a, %b) {
+; CHECK-LABEL: trn2_f16_v2:
+; CHECK: trn2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.trn2.nxv2f16( %a,
+                                              %b)
+  ret %out
+}
+
 define @trn2_f16_v4( %a, %b) {
 ; CHECK-LABEL: trn2_f16_v4:
 ; CHECK: trn2 z0.s, z0.s, z1.s
@@ -1425,6 +1452,15 @@ define @trn2_f16( %a,
   ret %out
 }
 
+define @trn2_f32_v2( %a, %b) {
+; CHECK-LABEL: trn2_f32_v2:
+; CHECK: trn2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.trn2.nxv2f32( %a,
+                                              %b)
+  ret %out
+}
+
 define @trn2_f32( %a, %b) {
 ; CHECK-LABEL: trn2_f32:
 ; CHECK: trn2 z0.s, z0.s, z1.s
@@ -1519,6 +1555,15 @@ define @uzp1_i64( %a, %b
   ret %out
 }
 
+define @uzp1_f16_v2( %a, %b) {
+; CHECK-LABEL: uzp1_f16_v2:
+; CHECK: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.uzp1.nxv2f16( %a,
+                                              %b)
+  ret %out
+}
+
 define @uzp1_f16_v4( %a, %b) {
 ; CHECK-LABEL: uzp1_f16_v4:
 ; CHECK: uzp1 z0.s, z0.s, z1.s
@@ -1546,6 +1591,15 @@ define @uzp1_f16( %a,
   ret %out
 }
 
+define @uzp1_f32_v2( %a, %b) {
+; CHECK-LABEL: uzp1_f32_v2:
+; CHECK: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.uzp1.nxv2f32( %a,
+                                              %b)
+  ret %out
+}
+
 define @uzp1_f32( %a, %b) {
 ; CHECK-LABEL: uzp1_f32:
 ; CHECK: uzp1 z0.s, z0.s, z1.s
@@ -1640,6 +1694,15 @@ define @uzp2_i64( %a, %b
   ret %out
 }
 
+define @uzp2_f16_v2( %a, %b) {
+; CHECK-LABEL: uzp2_f16_v2:
+; CHECK: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.uzp2.nxv2f16( %a,
+                                              %b)
+  ret %out
+}
+
 define @uzp2_f16_v4( %a, %b) {
 ; CHECK-LABEL: uzp2_f16_v4:
 ; CHECK: uzp2 z0.s, z0.s, z1.s
@@ -1667,6 +1730,15 @@ define @uzp2_f16( %a,
   ret %out
 }
 
+define @uzp2_f32_v2( %a, %b) {
+; CHECK-LABEL: uzp2_f32_v2:
+; CHECK: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call @llvm.aarch64.sve.uzp2.nxv2f32( %a,
+                                              %b)
+  ret %out
+}
+
 define @uzp2_f32( %a, %b) {
 ; CHECK-LABEL: uzp2_f32:
 ; CHECK: uzp2 z0.s, z0.s, z1.s
@@ -1761,6 +1833,15 @@ define @zip1_i64( %a, %b
   ret %out
 }
 
+define @zip1_f16_v2( %a, %b) {
+; CHECK-LABEL: zip1_f16_v2:
+; CHECK: zip1 z0.d, z
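The unpacked types exercised by the new patterns, such as nxv2f32, keep one element
per 64-bit lane, which is why the ZIP/UZP/TRN instructions they select use the .d
element size. A reconstructed sketch of one such case, with the scalable-vector
types written out in full (assumed from the nxv2f32 intrinsic suffix, not copied
verbatim from the commit):

; Reconstructed sketch with assumed types (inferred from the .nxv2f32 suffix).
declare <vscale x 2 x float> @llvm.aarch64.sve.trn1.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)

define <vscale x 2 x float> @trn1_f32_v2_example(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
; CHECK: trn1 z0.d, z0.d, z1.d
  %out = call <vscale x 2 x float> @llvm.aarch64.sve.trn1.nxv2f32(<vscale x 2 x float> %a,
                                                                  <vscale x 2 x float> %b)
  ret <vscale x 2 x float> %out
}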