[llvm-branch-commits] [llvm] e47a6a2 - [X86] Detect if EFLAGS is live across XBEGIN pseudo instruction. Add it as livein to the basic blocks created when expanding the pseudo

2020-07-28 Thread Hans Wennborg via llvm-branch-commits

Author: Craig Topper
Date: 2020-07-28T13:43:23+02:00
New Revision: e47a6a224a4b6d0ce98028f560a8b3806d145907

URL: 
https://github.com/llvm/llvm-project/commit/e47a6a224a4b6d0ce98028f560a8b3806d145907
DIFF: 
https://github.com/llvm/llvm-project/commit/e47a6a224a4b6d0ce98028f560a8b3806d145907.diff

LOG: [X86] Detect if EFLAGS is live across XBEGIN pseudo instruction. Add it as
livein to the basic blocks created when expanding the pseudo

XBEGIN causes several basic blocks to be inserted. If the flags are live across
it, we need to make EFLAGS live-in to the new basic blocks to avoid machine
verifier errors.

Fixes PR46827
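
A reduced illustration of the failure mode (a hypothetical sketch modeled on
the pr46827.ll test added below, not its exact body): the overflow flag is
produced before the XBEGIN pseudo and only consumed after it, so once the
pseudo is expanded EFLAGS must be marked live-in to the new blocks.

declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32)
declare i32 @llvm.x86.xbegin()

define i32 @flags_live_across_xbegin(i32 %a) nounwind {
  ; The subtract defines EFLAGS; the overflow bit is only read by the
  ; branch below, after the XBEGIN expansion inserts its new blocks.
  %sub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 1)
  %ovf = extractvalue { i32, i1 } %sub, 1
  %txn = call i32 @llvm.x86.xbegin() ; expands into mainMBB/fallMBB/sinkMBB
  br i1 %ovf, label %overflow, label %normal

overflow:
  ret i32 0

normal:
  ret i32 %txn
}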

Reviewed By: ivanbaev

Differential Revision: https://reviews.llvm.org/D84479

(cherry picked from commit 647e861e080382593648b234668ad2f5a376ac5e)

Added: 
llvm/test/CodeGen/X86/pr46827.ll

Modified: 
llvm/lib/Target/X86/X86ISelLowering.cpp

Removed: 




diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f8b6b7eb3aff..7d846e4f2a77 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30953,6 +30953,34 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
 //                           X86 Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
+// Returns true if EFLAG is consumed after this iterator in the rest of the
+// basic block or any successors of the basic block.
+static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
+                              MachineBasicBlock *BB) {
+  // Scan forward through BB for a use/def of EFLAGS.
+  for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
+       miI != miE; ++miI) {
+    const MachineInstr& mi = *miI;
+    if (mi.readsRegister(X86::EFLAGS))
+      return true;
+    // If we found a def, we can stop searching.
+    if (mi.definesRegister(X86::EFLAGS))
+      return false;
+  }
+
+  // If we hit the end of the block, check whether EFLAGS is live into a
+  // successor.
+  for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
+                                        sEnd = BB->succ_end();
+       sItr != sEnd; ++sItr) {
+    MachineBasicBlock* succ = *sItr;
+    if (succ->isLiveIn(X86::EFLAGS))
+      return true;
+  }
+
+  return false;
+}
+
 /// Utility function to emit xbegin specifying the start of an RTM region.
 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                      const TargetInstrInfo *TII) {
@@ -30985,6 +31013,12 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
   MF->insert(I, fallMBB);
   MF->insert(I, sinkMBB);
 
+  if (isEFLAGSLiveAfter(MI, MBB)) {
+    mainMBB->addLiveIn(X86::EFLAGS);
+    fallMBB->addLiveIn(X86::EFLAGS);
+    sinkMBB->addLiveIn(X86::EFLAGS);
+  }
+
   // Transfer the remainder of BB and its successor edges to sinkMBB.
   sinkMBB->splice(sinkMBB->begin(), MBB,
                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
@@ -31373,27 +31407,8 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                      MachineBasicBlock* BB,
                                      const TargetRegisterInfo* TRI) {
-  // Scan forward through BB for a use/def of EFLAGS.
-  MachineBasicBlock::iterator miI(std::next(SelectItr));
-  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
-    const MachineInstr& mi = *miI;
-    if (mi.readsRegister(X86::EFLAGS))
-      return false;
-    if (mi.definesRegister(X86::EFLAGS))
-      break; // Should have kill-flag - update below.
-  }
-
-  // If we hit the end of the block, check whether EFLAGS is live into a
-  // successor.
-  if (miI == BB->end()) {
-    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
-                                          sEnd = BB->succ_end();
-         sItr != sEnd; ++sItr) {
-      MachineBasicBlock* succ = *sItr;
-      if (succ->isLiveIn(X86::EFLAGS))
-        return false;
-    }
-  }
+  if (isEFLAGSLiveAfter(SelectItr, BB))
+    return false;
 
   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
   // out. SelectMI should have a kill flag on EFLAGS.

diff --git a/llvm/test/CodeGen/X86/pr46827.ll b/llvm/test/CodeGen/X86/pr46827.ll
new file mode 100644
index 000000000000..438b13c3400f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr46827.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+rtm -verify-machineinstrs -stop-after=finalize-isel | FileCheck %s
+
+; CHECK: body: |
+; CHECK:   bb.0.bb107:
+; CHECK:     successors: %bb.3(0x4000), %bb.4(0x4000)
+; CHECK:     %0:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %f

[llvm-branch-commits] [llvm] 592454c - [X86][SSE] Add additional (f)add(shuffle(x, y), shuffle(x, y)) tests for D83789

2020-07-28 Thread Hans Wennborg via llvm-branch-commits

Author: Simon Pilgrim
Date: 2020-07-28T13:47:17+02:00
New Revision: 592454c367dec9d3dd0abfb840cbaa664bc67bb8

URL: 
https://github.com/llvm/llvm-project/commit/592454c367dec9d3dd0abfb840cbaa664bc67bb8
DIFF: 
https://github.com/llvm/llvm-project/commit/592454c367dec9d3dd0abfb840cbaa664bc67bb8.diff

LOG: [X86][SSE] Add additional (f)add(shuffle(x,y),shuffle(x,y)) tests for D83789
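
The shufflevector mask constants in the archived diff below were stripped in
transit. As a rough guide, a reverse-hadd pattern of the kind these tests
exercise looks like the following hypothetical sketch (not the committed test
body; see the upstream file for the exact masks):

define <8 x i16> @hadd_reverse_sketch(<8 x i16> %a0, <8 x i16> %a1) nounwind {
  ; Each result element adds an adjacent even/odd pair, with the pairs and
  ; the two sources taken in reverse order.
  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 15, i32 13, i32 11, i32 9, i32 7, i32 5, i32 3, i32 1>
  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 14, i32 12, i32 10, i32 8, i32 6, i32 4, i32 2, i32 0>
  %add = add <8 x i16> %lhs, %rhs
  ret <8 x i16> %add
}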

(cherry picked from commit bfc4294ef61d5cf69fffe6b64287a323c003d90f)

Added: 
llvm/test/CodeGen/X86/haddsub-4.ll

Modified: 


Removed: 




diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll
new file mode 100644
index 000000000000..5c8e9a7c72f2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsub-4.ll
@@ -0,0 +1,405 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSSE3,SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops  | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX2,AVX2-FAST
+
+define <8 x i16> @hadd_reverse_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
+; SSE-LABEL: hadd_reverse_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15]
+; SSE-NEXT:    movdqa %xmm1, %xmm3
+; SSE-NEXT:    pshufb %xmm2, %xmm3
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    pshufb %xmm2, %xmm4
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15]
+; SSE-NEXT:    pshufb %xmm2, %xmm1
+; SSE-NEXT:    pshufb %xmm2, %xmm0
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    paddw %xmm4, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: hadd_reverse_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %lhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> 
+  %rhs = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> 
+  %add = add <8 x i16> %lhs, %rhs
+  ret <8 x i16> %add
+}
+
+define <8 x i16> @hadd_reverse2_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
+; SSE-LABEL: hadd_reverse2_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; SSE-NEXT:    pshufb %xmm2, %xmm0
+; SSE-NEXT:    pshufb %xmm2, %xmm1
+; SSE-NEXT:    phaddw %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: hadd_reverse2_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %shuf0 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> 
+  %shuf1 = shufflevector <8 x i16> %a1, <8 x i16> undef, <8 x i32> 
+  %lhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> 
+  %rhs = shufflevector <8 x i16> %shuf0, <8 x i16> %shuf1, <8 x i32> 
+  %add = add <8 x i16> %lhs, %rhs
+  ret <8 x i16> %add
+}
+
+define <16 x i16> @hadd_reverse_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
+; SSE-LABEL: hadd_reverse_v16i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm4
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [14,15,10,11,6,7,2,3,14,15,10,11,12,13,14,15]
+; SSE-NEXT:    movdqa %xmm2, %xmm5
+; SSE-NEXT:    pshufb %xmm0, %xmm5
+; SSE-NEXT:    movdqa %xmm4, %xmm6
+; SSE-NEXT:    pshufb %xmm0, %xmm6
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm5[0]
+; SSE-NEXT:    movdqa %xmm3, %xmm5
+; SSE-NEXT:    pshufb %xmm0, %xmm5
+; SSE-NEXT:    movdqa %xmm1, %xmm7
+; SSE-NEXT:    pshufb %xmm0, %xmm7
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm5[0]
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [12,13,8,9,4,5,0,1,8,9,12,13,12,13,14,15]
+; SSE-NEXT:    pshufb %xmm0, %xmm2
+; SSE-NEXT:    pshufb %xmm0, %xmm4
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
+; SSE

[llvm-branch-commits] [llvm] d687594 - [X86][SSE] Attempt to match OP(SHUFFLE(X, Y), SHUFFLE(X, Y)) -> SHUFFLE(HOP(X, Y))

2020-07-28 Thread Hans Wennborg via llvm-branch-commits

Author: Simon Pilgrim
Date: 2020-07-28T13:47:17+02:00
New Revision: d6875948aaade1cd39e5d9b373d02749dd1e58f2

URL: 
https://github.com/llvm/llvm-project/commit/d6875948aaade1cd39e5d9b373d02749dd1e58f2
DIFF: 
https://github.com/llvm/llvm-project/commit/d6875948aaade1cd39e5d9b373d02749dd1e58f2.diff

LOG: [X86][SSE] Attempt to match OP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y))

An initial backend patch towards fixing the various poor HADD combines 
(PR34724, PR41813, PR45747 etc.).

This extends isHorizontalBinOp to check if we have per-element horizontal ops 
(odd+even element pairs), but not in the expected serial order - in which case 
we build a "post shuffle mask" that we can apply to the HOP result, assuming we 
have fast-hops/optsize etc.
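
As an illustrative sketch (not one of the patch's own tests): in the IR below,
each fadd lane consumes an adjacent even/odd pair of HADDPS(x, y), just not in
serial order, so the combine can emit a single HADDPS followed by a post
shuffle with mask <1,0,3,2> instead of two shuffles plus an add.

define <4 x float> @hadd_post_shuffle(<4 x float> %x, <4 x float> %y) {
  %l = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 2, i32 0, i32 6, i32 4>
  %r = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 3, i32 1, i32 7, i32 5>
  ; %a = <x2+x3, x0+x1, y2+y3, y0+y1>, i.e. HADDPS(x, y) =
  ; <x0+x1, x2+x3, y0+y1, y2+y3> reordered by the post shuffle mask <1,0,3,2>.
  %a = fadd <4 x float> %l, %r
  ret <4 x float> %a
}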

The next step will be to extend the SHUFFLE(HOP(X,Y)) combines as suggested on 
PR41813 - accepting more post-shuffle masks even on slow-hop targets if we can 
fold it into another shuffle.

Differential Revision: https://reviews.llvm.org/D83789

(cherry picked from commit 182111777b4ec215eeebe8ab5cc2a324e2f055ff)

Added: 


Modified: 
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/haddsub-3.ll
llvm/test/CodeGen/X86/haddsub-4.ll
llvm/test/CodeGen/X86/haddsub-shuf.ll
llvm/test/CodeGen/X86/haddsub-undef.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll

Removed: 




diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7d846e4f2a77..86aa85e965f6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44364,8 +44364,8 @@ static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
 /// A horizontal-op B, for some already available A and B, and if so then LHS is
 /// set to A, RHS to B, and the routine returns 'true'.
 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool IsCommutative) {
+                              const X86Subtarget &Subtarget, bool IsCommutative,
+                              SmallVectorImpl<int> &PostShuffleMask) {
   // If either operand is undef, bail out. The binop should be simplified.
   if (LHS.isUndef() || RHS.isUndef())
     return false;
@@ -44458,6 +44458,12 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
       RMask.push_back(i);
   }
 
+  // Avoid 128-bit lane crossing if pre-AVX2 and FP (integer will split).
+  if (!Subtarget.hasAVX2() && VT.isFloatingPoint() &&
+      (isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), LMask) ||
+       isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), RMask)))
+    return false;
+
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
   // RHS operands and shuffle mask.
   if (A != C) {
@@ -44468,6 +44474,9 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   if (!(A == C && B == D))
     return false;
 
+  PostShuffleMask.clear();
+  PostShuffleMask.append(NumElts, SM_SentinelUndef);
+
   // LHS and RHS are now:
   //   LHS = shuffle A, B, LMask
   //   RHS = shuffle A, B, RMask
@@ -44476,6 +44485,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
   // so we just repeat the inner loop if this is a 256-bit op.
   unsigned Num128BitChunks = VT.getSizeInBits() / 128;
   unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
+  unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
   assert((NumEltsPer128BitChunk % 2 == 0) &&
          "Vector type should have an even number of elements in each lane");
   for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
@@ -44487,25 +44497,40 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, SelectionDAG &DAG,
           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
         continue;
 
+      // Check that successive odd/even elements are being operated on. If not,
+      // this is not a horizontal operation.
+      if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
+          !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
+        return false;
+
+      // Compute the post-shuffle mask index based on where the element
+      // is stored in the HOP result, and where it needs to be moved to.
+      int Base = LIdx & ~1u;
+      int Index = ((Base % NumEltsPer128BitChunk) / 2) +
+                  ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
+
       // The  low half of the 128-bit result must choose from A.
       // The high half of the 128-bit result must choose from B,
       // unless B is undef. In that case, we are always choosing from A.
-      unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
-      unsigned Src = B.getNode() ? i >= NumEltsPer6