[llvm-branch-commits] [llvm] 92be640 - [FPEnv][AMDGPU] Disable FSUB(-0, X)->FNEG(X) DAGCombine when subnormals are flushed

2021-01-04 Thread Cameron McInally via llvm-branch-commits

Author: Cameron McInally
Date: 2021-01-04T14:44:10-06:00
New Revision: 92be640bd7d4fbc8e032a0aa81381a0246efa0be

URL: 
https://github.com/llvm/llvm-project/commit/92be640bd7d4fbc8e032a0aa81381a0246efa0be
DIFF: 
https://github.com/llvm/llvm-project/commit/92be640bd7d4fbc8e032a0aa81381a0246efa0be.diff

LOG: [FPEnv][AMDGPU] Disable FSUB(-0,X)->FNEG(X) DAGCombine when subnormals are flushed

This patch disables the FSUB(-0,X)->FNEG(X) DAG combine when we're flushing 
subnormals. It requires updating the existing AMDGPU tests to use the fneg IR 
instruction, in place of the old fsub(-0,X) canonical form, since AMDGPU is the 
only backend currently checking the DenormalMode flags.

Note that this will require follow-up optimizations to make sure the FSUB(-0,X)
form is still handled appropriately.

Differential Revision: https://reviews.llvm.org/D93243
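
To illustrate the semantic difference the patch guards against, here is a
minimal IR sketch (hypothetical functions, not part of this patch): fsub is an
arithmetic operation, so on a target that flushes denormal outputs its
subnormal result is replaced with a zero, while fneg only flips the sign bit
and preserves the subnormal value, so the two are not interchangeable there.

; Hypothetical example, not from the patch. With output denormals flushed,
; @via_fsub may return a (signed) zero for a subnormal %x, while @via_fneg
; returns the sign-flipped subnormal, so rewriting the fsub as an fneg would
; change the observable result.
define float @via_fsub(float %x) {
  %r = fsub float -0.0, %x      ; arithmetic negate: result can be flushed
  ret float %r
}

define float @via_fneg(float %x) {
  %r = fneg float %x            ; pure sign-bit flip: never flushed
  ret float %r
}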

Added: 


Modified: 
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
llvm/test/CodeGen/AMDGPU/clamp.ll
llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
llvm/test/CodeGen/AMDGPU/fma-combine.ll
llvm/test/CodeGen/AMDGPU/fneg-combines.ll
llvm/test/CodeGen/AMDGPU/fpext-free.ll
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
llvm/test/CodeGen/AMDGPU/known-never-snan.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmad.ftz.ll
llvm/test/CodeGen/AMDGPU/mad-combine.ll
llvm/test/CodeGen/AMDGPU/mad-mix.ll
llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
llvm/test/CodeGen/AMDGPU/rsq.ll
llvm/test/CodeGen/AMDGPU/v_mac.ll
llvm/test/CodeGen/AMDGPU/v_mac_f16.ll

Removed: 




diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 92b23df9e3af..6b1bd721a993 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13367,18 +13367,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   }
 
   // (fsub -0.0, N1) -> -N1
-  // NOTE: It is safe to transform an FSUB(-0.0,X) into an FNEG(X), since the
-  //       FSUB does not specify the sign bit of a NaN. Also note that for
-  //       the same reason, the inverse transform is not safe, unless fast math
-  //       flags are in play.
   if (N0CFP && N0CFP->isZero()) {
     if (N0CFP->isNegative() ||
        (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
-      if (SDValue NegN1 =
-              TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
-        return NegN1;
-      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-        return DAG.getNode(ISD::FNEG, DL, VT, N1);
+      // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
+      // flushed to zero, unless all users treat denorms as zero (DAZ).
+      // FIXME: This transform will change the sign of a NaN and the behavior
+      // of a signaling NaN. It is only valid when a NoNaN flag is present.
+      DenormalMode DenormMode = DAG.getDenormalMode(VT);
+      if (DenormMode == DenormalMode::getIEEE()) {
+        if (SDValue NegN1 =
+                TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+          return NegN1;
+        if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+          return DAG.getNode(ISD::FNEG, DL, VT, N1);
+      }
     }
   }
 

diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 5a56a1a264af..4f3d6442da44 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -62,7 +62,7 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, flo
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
   %floor = call float @llvm.floor.f32(float %a)
-  %neg.floor = fsub float -0.0, %floor
+  %neg.floor = fneg float %floor
   %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
   %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
   store float %clamp, float addrspace(1)* %out.gep

diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 1e18b2fa1c1b..256bea7fb7fb 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -25,7 +25,7 @@ define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrs
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %a = load float, float addrspace(1)* %gep0
-  %fneg.a = fsub float -0.0, %a
+  %fneg.a = fneg float %a
   %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
   %med = call float @llvm.minnum.f32(float %max, float 1.0)
 
@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float ad
   %out.gep = getelementptr float, float addrspace(1)* 

[llvm-branch-commits] [llvm] f401335 - [SVE] Add unpacked scalable floating point ZIP/UZP/TRN patterns

2021-01-07 Thread Cameron McInally via llvm-branch-commits

Author: Cameron McInally
Date: 2021-01-07T09:56:53-06:00
New Revision: f4013359b3da2c78e94a64245de8638460f96c1a

URL: 
https://github.com/llvm/llvm-project/commit/f4013359b3da2c78e94a64245de8638460f96c1a
DIFF: 
https://github.com/llvm/llvm-project/commit/f4013359b3da2c78e94a64245de8638460f96c1a.diff

LOG: [SVE] Add unpacked scalable floating point ZIP/UZP/TRN patterns

Differential Revision: https://reviews.llvm.org/D94193
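
Here "unpacked" means a scalable vector with fewer elements than its container
register, e.g. <vscale x 2 x float> occupying only the 64-bit (.d) lanes of a
Z register. A minimal IR sketch of a case these new patterns cover (function
name invented; intrinsic as used in the tests below):

; Hypothetical example, not from the patch: an unpacked nxv2f32 transpose that
; the new patterns select as a .d-element trn1, i.e. trn1 z0.d, z0.d, z1.d.
declare <vscale x 2 x float> @llvm.aarch64.sve.trn1.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>)

define <vscale x 2 x float> @unpacked_trn1_f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
  %r = call <vscale x 2 x float> @llvm.aarch64.sve.trn1.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
  ret <vscale x 2 x float> %r
}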

Added: 


Modified: 
llvm/lib/Target/AArch64/SVEInstrFormats.td
llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll

Removed: 




diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1020a81a3494..b4416135eeb5 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2270,6 +2270,8 @@ multiclass sve_int_perm_bin_perm_zz<bits<3> opc, string asm,
   def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
   def : SVE_2_Op_Pat<nxv4f16, op, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _S)>;
   def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_2_Op_Pat<nxv2f16, op, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _D)>;
+  def : SVE_2_Op_Pat<nxv2f32, op, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _D)>;
   def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
 
   def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _H)>;

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
index b248d209f44a..9f0d71e6a7b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-perm-select.ll
@@ -1277,6 +1277,15 @@ define <vscale x 2 x i64> @trn1_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 2 x half> @trn1_f16_v2(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: trn1_f16_v2:
+; CHECK: trn1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x half> @llvm.aarch64.sve.trn1.nxv2f16(<vscale x 2 x half> %a,
+                                                                 <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %out
+}
+
 define <vscale x 4 x half> @trn1_f16_v4(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: trn1_f16_v4:
 ; CHECK: trn1 z0.s, z0.s, z1.s
@@ -1304,6 +1313,15 @@ define <vscale x 8 x half> @trn1_f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 2 x float> @trn1_f32_v2(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: trn1_f32_v2:
+; CHECK: trn1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x float> @llvm.aarch64.sve.trn1.nxv2f32(<vscale x 2 x float> %a,
+                                                                  <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %out
+}
+
 define <vscale x 4 x float> @trn1_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: trn1_f32:
 ; CHECK: trn1 z0.s, z0.s, z1.s
@@ -1398,6 +1416,15 @@ define <vscale x 2 x i64> @trn2_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 2 x half> @trn2_f16_v2(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: trn2_f16_v2:
+; CHECK: trn2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x half> @llvm.aarch64.sve.trn2.nxv2f16(<vscale x 2 x half> %a,
+                                                                 <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %out
+}
+
 define <vscale x 4 x half> @trn2_f16_v4(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: trn2_f16_v4:
 ; CHECK: trn2 z0.s, z0.s, z1.s
@@ -1425,6 +1452,15 @@ define <vscale x 8 x half> @trn2_f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 2 x float> @trn2_f32_v2(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: trn2_f32_v2:
+; CHECK: trn2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x float> @llvm.aarch64.sve.trn2.nxv2f32(<vscale x 2 x float> %a,
+                                                                  <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %out
+}
+
 define <vscale x 4 x float> @trn2_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: trn2_f32:
 ; CHECK: trn2 z0.s, z0.s, z1.s
@@ -1519,6 +1555,15 @@ define <vscale x 2 x i64> @uzp1_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 2 x half> @uzp1_f16_v2(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: uzp1_f16_v2:
+; CHECK: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x half> @llvm.aarch64.sve.uzp1.nxv2f16(<vscale x 2 x half> %a,
+                                                                 <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %out
+}
+
 define <vscale x 4 x half> @uzp1_f16_v4(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: uzp1_f16_v4:
 ; CHECK: uzp1 z0.s, z0.s, z1.s
@@ -1546,6 +1591,15 @@ define <vscale x 8 x half> @uzp1_f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 2 x float> @uzp1_f32_v2(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: uzp1_f32_v2:
+; CHECK: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x float> @llvm.aarch64.sve.uzp1.nxv2f32(<vscale x 2 x float> %a,
+                                                                  <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %out
+}
+
 define <vscale x 4 x float> @uzp1_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: uzp1_f32:
 ; CHECK: uzp1 z0.s, z0.s, z1.s
@@ -1640,6 +1694,15 @@ define <vscale x 2 x i64> @uzp2_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 2 x half> @uzp2_f16_v2(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: uzp2_f16_v2:
+; CHECK: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x half> @llvm.aarch64.sve.uzp2.nxv2f16(<vscale x 2 x half> %a,
+                                                                 <vscale x 2 x half> %b)
+  ret <vscale x 2 x half> %out
+}
+
 define <vscale x 4 x half> @uzp2_f16_v4(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
 ; CHECK-LABEL: uzp2_f16_v4:
 ; CHECK: uzp2 z0.s, z0.s, z1.s
@@ -1667,6 +1730,15 @@ define <vscale x 8 x half> @uzp2_f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b
   ret <vscale x 8 x half> %out
 }
 
+define <vscale x 2 x float> @uzp2_f32_v2(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; CHECK-LABEL: uzp2_f32_v2:
+; CHECK: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ret
+  %out = call <vscale x 2 x float> @llvm.aarch64.sve.uzp2.nxv2f32(<vscale x 2 x float> %a,
+                                                                  <vscale x 2 x float> %b)
+  ret <vscale x 2 x float> %out
+}
+
 define <vscale x 4 x float> @uzp2_f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: uzp2_f32:
 ; CHECK: uzp2 z0.s, z0.s, z1.s
@@ -1761,6 +1833,15 @@ define <vscale x 2 x i64> @zip1_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b
   ret <vscale x 2 x i64> %out
 }
 
+define <vscale x 2 x half> @zip1_f16_v2(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; CHECK-LABEL: zip1_f16_v2:
+; CHECK: zip1 z0.d, z