Author: Simon Pilgrim Date: 2021-01-18T10:14:45Z New Revision: 770d1e0a8828010a7c95de4596e24d54ed2527c3
URL: https://github.com/llvm/llvm-project/commit/770d1e0a8828010a7c95de4596e24d54ed2527c3 DIFF: https://github.com/llvm/llvm-project/commit/770d1e0a8828010a7c95de4596e24d54ed2527c3.diff LOG: [X86][SSE] isHorizontalBinOp - reuse any existing horizontal ops. If we already have similar horizontal ops using the same args, then match that, even if we are on a target with slow horizontal ops. Added: Modified: llvm/lib/Target/X86/X86ISelLowering.cpp llvm/test/CodeGen/X86/haddsub-shuf.ll llvm/test/CodeGen/X86/haddsub-undef.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6bee21747bce..78a5d4a6dfbf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45628,8 +45628,9 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form /// A horizontal-op B, for some already available A and B, and if so then LHS is /// set to A, RHS to B, and the routine returns 'true'. -static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, - const X86Subtarget &Subtarget, bool IsCommutative, +static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, + SelectionDAG &DAG, const X86Subtarget &Subtarget, + bool IsCommutative, SmallVectorImpl<int> &PostShuffleMask) { // If either operand is undef, bail out. The binop should be simplified. if (LHS.isUndef() || RHS.isUndef()) @@ -45790,9 +45791,20 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask)) return false; + // If the source nodes are already used in HorizOps then always accept this. + // Shuffle folding should merge these back together. + bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) { + return User->getOpcode() == HOpcode && User->getValueType(0) == VT; + }); + bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) { + return User->getOpcode() == HOpcode && User->getValueType(0) == VT; + }); + bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS; + // Assume a SingleSource HOP if we only shuffle one input and don't need to // shuffle the result. - if (!shouldUseHorizontalOp(NewLHS == NewRHS && + if (!ForceHorizOp && + !shouldUseHorizontalOp(NewLHS == NewRHS && (NumShuffles < 2 || !IsIdentityPostShuffle), DAG, Subtarget)) return false; @@ -45816,7 +45828,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, SmallVector<int, 8> PostShuffleMask; if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) && - isHorizontalBinOp(LHS, RHS, DAG, Subtarget, IsFadd, PostShuffleMask)) { + isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsFadd, + PostShuffleMask)) { SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); if (!PostShuffleMask.empty()) HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, @@ -48931,17 +48944,18 @@ static SDValue combineAddOrSubToHADDorHSUB(SDNode *N, SelectionDAG &DAG, SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); bool IsAdd = N->getOpcode() == ISD::ADD; + auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB; assert((IsAdd || N->getOpcode() == ISD::SUB) && "Wrong opcode"); SmallVector<int, 8> PostShuffleMask; if ((VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasSSSE3() && - isHorizontalBinOp(Op0, Op1, DAG, Subtarget, IsAdd, PostShuffleMask)) { - auto HOpBuilder = [IsAdd](SelectionDAG &DAG, const SDLoc &DL, - ArrayRef<SDValue> Ops) { - return DAG.getNode(IsAdd ? X86ISD::HADD : X86ISD::HSUB, DL, - Ops[0].getValueType(), Ops); + isHorizontalBinOp(HorizOpcode, Op0, Op1, DAG, Subtarget, IsAdd, + PostShuffleMask)) { + auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, + ArrayRef<SDValue> Ops) { + return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops); }; SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1}, HOpBuilder); diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 37eedcd54441..282ef37f6e52 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -873,45 +873,15 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) { declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) { -; SSSE3_SLOW-LABEL: PR34724_1: -; SSSE3_SLOW: # %bb.0: -; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm0 -; SSSE3_SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3_SLOW-NEXT: addps %xmm1, %xmm2 -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] -; SSSE3_SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSSE3_SLOW-NEXT: retq -; -; SSSE3_FAST-LABEL: PR34724_1: -; SSSE3_FAST: # %bb.0: -; SSSE3_FAST-NEXT: haddps %xmm1, %xmm0 -; SSSE3_FAST-NEXT: retq -; -; AVX1_SLOW-LABEL: PR34724_1: -; AVX1_SLOW: # %bb.0: -; AVX1_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; AVX1_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX1_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX1_SLOW-NEXT: retq -; -; AVX1_FAST-LABEL: PR34724_1: -; AVX1_FAST: # %bb.0: -; AVX1_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1_FAST-NEXT: retq -; -; AVX2_SLOW-LABEL: PR34724_1: -; AVX2_SLOW: # %bb.0: -; AVX2_SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2_SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; AVX2_SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX2_SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX2_SLOW-NEXT: retq +; SSSE3-LABEL: PR34724_1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: haddps %xmm1, %xmm0 +; SSSE3-NEXT: retq ; -; AVX2_FAST-LABEL: PR34724_1: -; AVX2_FAST: # %bb.0: -; AVX2_FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2_FAST-NEXT: retq +; AVX-LABEL: PR34724_1: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %t0 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 2, i32 4> %t1 = shufflevector <4 x float> %a, <4 x float> %b, <2 x i32> <i32 3, i32 5> %t2 = fadd <2 x float> %t0, %t1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index d268438121ef..48ee31fe64fc 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -576,30 +576,17 @@ define <4 x float> @add_ps_008(<4 x float> %x) { } define <4 x float> @add_ps_016(<4 x float> %0, <4 x float> %1) { -; SSE-SLOW-LABEL: add_ps_016: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE-SLOW-NEXT: haddps %xmm0, %xmm1 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[3,3] -; SSE-SLOW-NEXT: movaps %xmm2, %xmm0 -; SSE-SLOW-NEXT: retq -; -; SSE-FAST-LABEL: add_ps_016: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm0, %xmm1 -; SSE-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3] -; SSE-FAST-NEXT: movaps %xmm1, %xmm0 -; SSE-FAST-NEXT: retq +; SSE-LABEL: add_ps_016: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0,3,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-SLOW-LABEL: add_ps_016: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,3] +; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3] ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: add_ps_016: @@ -1006,32 +993,15 @@ define <4 x float> @PR45747_2(<4 x float> %a, <4 x float> %b) nounwind { } define <4 x float> @PR34724_add_v4f32_u123(<4 x float> %0, <4 x float> %1) { -; SSE-SLOW-LABEL: PR34724_add_v4f32_u123: -; SSE-SLOW: # %bb.0: -; SSE-SLOW-NEXT: haddps %xmm1, %xmm0 -; SSE-SLOW-NEXT: movsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE-SLOW-NEXT: addps %xmm1, %xmm2 -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[2,0] -; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] -; SSE-SLOW-NEXT: retq -; -; SSE-FAST-LABEL: PR34724_add_v4f32_u123: -; SSE-FAST: # %bb.0: -; SSE-FAST-NEXT: haddps %xmm1, %xmm0 -; SSE-FAST-NEXT: retq -; -; AVX-SLOW-LABEL: PR34724_add_v4f32_u123: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2] -; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1 -; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] -; AVX-SLOW-NEXT: retq +; SSE-LABEL: PR34724_add_v4f32_u123: +; SSE: # %bb.0: +; SSE-NEXT: haddps %xmm1, %xmm0 +; SSE-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f32_u123: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX-FAST-NEXT: retq +; AVX-LABEL: PR34724_add_v4f32_u123: +; AVX: # %bb.0: +; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %3 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 2, i32 4> %4 = shufflevector <4 x float> %0, <4 x float> %1, <2 x i32> <i32 3, i32 5> %5 = fadd <2 x float> %3, %4 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits