https://github.com/RKSimon updated 
https://github.com/llvm/llvm-project/pull/204144

>From af79ef9c7d711f0215456ced9924367c1be71acd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Tue, 16 Jun 2026 14:19:33 +0100
Subject: [PATCH 1/4] [WIP][X86] Replace X86 specific pdep/pext handling with
 generic PDEP/PEXT intrinsics

* Remove X86ISD::PDEP/PEXT and use ISD::PDEP/PEXT instead
* AutoUpgrade x86 pdep/pext intrinsics to llvm.pdep/pext generics
* Move X86 DAG knownbits/demandedbits handling to generic (unchanged)
* Move X86 InstCombine folds to generic (unchanged)
* Update clang builtins to emit generics
* Add generic test coverage to msan + instcombine, similar to the existing x86
  tests
---
 clang/lib/CodeGen/TargetBuiltins/X86.cpp      | 10 +++
 clang/test/CodeGen/X86/bmi2-builtins.c        |  8 +-
 llvm/include/llvm/IR/IntrinsicsX86.td         | 12 ---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  6 ++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 18 ++++
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 28 ++++++
 llvm/lib/IR/AutoUpgrade.cpp                   |  8 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 50 +----------
 .../Target/X86/X86InstCombineIntrinsic.cpp    | 88 -------------------
 llvm/lib/Target/X86/X86InstrFragments.td      |  4 -
 llvm/lib/Target/X86/X86InstrMisc.td           | 54 ++----------
 llvm/lib/Target/X86/X86IntrinsicsInfo.h       |  4 -
 .../InstCombine/InstCombineCalls.cpp          | 58 ++++++++++++
 .../Instrumentation/MemorySanitizer.cpp       | 29 +++++-
 llvm/test/CodeGen/X86/bmi2.ll                 | 23 +++--
 .../Instrumentation/MemorySanitizer/bmi.ll    | 16 ++--
 .../Instrumentation/MemorySanitizer/pdep.ll   | 35 +++++---
 .../Instrumentation/MemorySanitizer/pext.ll   | 35 +++++---
 llvm/test/Transforms/InstCombine/pdep.ll      | 30 +++----
 llvm/test/Transforms/InstCombine/pext.ll      | 30 +++----
 20 files changed, 259 insertions(+), 287 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp 
b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index acfeb9967cd2f..50125a71fcd5f 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -976,6 +976,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned 
BuiltinID,
     Function *F = CGM.getIntrinsic(Intrinsic::cttz, Ops[0]->getType());
     return Builder.CreateCall(F, {Ops[0], Builder.getInt1(false)});
   }
+  case X86::BI__builtin_ia32_pdep_si:
+  case X86::BI__builtin_ia32_pdep_di: {
+    Function *F = CGM.getIntrinsic(Intrinsic::pdep, Ops[0]->getType());
+    return Builder.CreateCall(F, Ops);
+  }
+  case X86::BI__builtin_ia32_pext_si:
+  case X86::BI__builtin_ia32_pext_di: {
+    Function *F = CGM.getIntrinsic(Intrinsic::pext, Ops[0]->getType());
+    return Builder.CreateCall(F, Ops);
+  }
   case X86::BI__builtin_ia32_undef128:
   case X86::BI__builtin_ia32_undef256:
   case X86::BI__builtin_ia32_undef512:
diff --git a/clang/test/CodeGen/X86/bmi2-builtins.c 
b/clang/test/CodeGen/X86/bmi2-builtins.c
index 1b2cb9048adb2..c83cc43d9fc3f 100644
--- a/clang/test/CodeGen/X86/bmi2-builtins.c
+++ b/clang/test/CodeGen/X86/bmi2-builtins.c
@@ -17,12 +17,12 @@ unsigned int test_bzhi_u32(unsigned int __X, unsigned int 
__Y) {
 }
 
 unsigned int test_pdep_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.pdep.32
+  // CHECK: @llvm.pdep.i32
   return _pdep_u32(__X, __Y);
 }
 
 unsigned int test_pext_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.pext.32
+  // CHECK: @llvm.pext.i32
   return _pext_u32(__X, __Y);
 }
 
@@ -41,12 +41,12 @@ unsigned long long test_bzhi_u64(unsigned long long __X, 
unsigned long long __Y)
 }
 
 unsigned long long test_pdep_u64(unsigned long long __X, unsigned long long 
__Y) {
-  // CHECK: @llvm.x86.bmi.pdep.64
+  // CHECK: @llvm.pdep.i64
   return _pdep_u64(__X, __Y);
 }
 
 unsigned long long test_pext_u64(unsigned long long __X, unsigned long long 
__Y) {
-  // CHECK: @llvm.x86.bmi.pext.64
+  // CHECK: @llvm.pext.i64
   return _pext_u64(__X, __Y);
 }
 
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td 
b/llvm/include/llvm/IR/IntrinsicsX86.td
index b75a0485d6263..5c7785731111c 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2575,18 +2575,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start 
with "llvm.x86.".
   def int_x86_bmi_bzhi_64 : ClangBuiltin<"__builtin_ia32_bzhi_di">,
       DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                             [IntrNoMem]>;
-  def int_x86_bmi_pdep_32 : ClangBuiltin<"__builtin_ia32_pdep_si">,
-      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pdep_64 : ClangBuiltin<"__builtin_ia32_pdep_di">,
-      DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pext_32 : ClangBuiltin<"__builtin_ia32_pext_si">,
-      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
-                            [IntrNoMem]>;
-  def int_x86_bmi_pext_64 : ClangBuiltin<"__builtin_ia32_pext_di">,
-      DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
-                            [IntrNoMem]>;
 }
 
 
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 
b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5a4ae64cb98af..122b7f89c9d6c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12245,12 +12245,18 @@ SDValue DAGCombiner::visitPDEP(SDNode *N) {
   // pdep(x, 0) -> 0
   if (isNullOrNullSplat(N1))
     return DAG.getConstant(0, DL, VT);
+
   // pdep(x, -1) -> x  (all positions selected, bits deposited at identity)
   if (isAllOnesOrAllOnesSplat(N1))
     return N0;
+
   // fold pdep(c1, c2) -> expandBits(c1, c2)
   if (SDValue C = DAG.FoldConstantArithmetic(ISD::PDEP, DL, VT, {N0, N1}))
     return C;
+
+  if (SimplifyDemandedBits(SDValue(N, 0)))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b32c16fe4300f..44120cceed2a3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3952,6 +3952,24 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, 
const APInt &DemandedElts,
     Known.Zero.setBitsFrom(1);
     break;
   }
+  case ISD::PDEP: {
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+    // Zeros are retained from the mask operand, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
+  case ISD::PEXT: {
+    Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+    // The result has as many leading zeros as the number of zeroes in the 
mask.
+    unsigned Count = Known.Zero.popcount();
+    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
+    Known.One.clearAllBits();
+    break;
+  }
   case ISD::CLMUL: {
     Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
     Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp 
b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5999e7a9c9fb2..0bd636d19065f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2463,6 +2463,34 @@ bool TargetLowering::SimplifyDemandedBits(
     Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
     break;
   }
+  case ISD::PDEP: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+
+    unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
+    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
+
+    // If the demanded bits has leading zeroes, we don't demand those from the
+    // mask.
+    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
+      return true;
+
+    // The number of possible 1s in the mask determines the number of LSBs of
+    // operand 0 used. Undemanded bits from the mask don't matter so filter
+    // them before counting.
+    KnownBits Known2;
+    uint64_t Count = (~Known.Zero & LoMask).popcount();
+    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
+    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
+      return true;
+
+    // Zeroes are retained from the mask, but not ones.
+    Known.One.clearAllBits();
+    // The result will have at least as many trailing zeros as the non-mask
+    // operand since bits can only map to the same or higher bit position.
+    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
+    break;
+  }
   case ISD::SIGN_EXTEND_INREG: {
     SDValue Op0 = Op.getOperand(0);
     EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 814e985ebf7be..9422fc6129efd 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -533,6 +533,10 @@ static bool shouldUpgradeX86Intrinsic(Function *F, 
StringRef Name) {
             Name.starts_with("vpcom") || // Added in 3.2, Updated in 9.0
             Name.starts_with("vprot"));  // Added in 8.0
 
+  if (Name.consume_front("bmi."))
+    return (Name.starts_with("pdep.") || // Added in 23.0
+            Name.starts_with("pext."));  // Added in 23.0
+
   return (Name == "addcarry.u32" ||        // Added in 8.0
           Name == "addcarry.u64" ||        // Added in 8.0
           Name == "addcarryx.u32" ||       // Added in 8.0
@@ -4616,6 +4620,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, 
CallBase *CI, Function *F,
   } else if (Name.starts_with("avx512.mask.") &&
              upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
     // Rep will be updated by the call in the condition.
+  } else if (Name.starts_with("bmi.pdep.")) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pdep);
+  } else if (Name.starts_with("bmi.pext.")) {
+    Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::pext);
   } else
     reportFatalUsageErrorWithCI("Unexpected intrinsic", CI);
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp 
b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b9a65e2671aa9..1bc4bfd4251cf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39748,25 +39748,6 @@ void 
X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     Known.One.clearAllBits();
     break;
   }
-  case X86ISD::PDEP: {
-    KnownBits Known2;
-    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
-    // Zeros are retained from the mask operand. But not ones.
-    Known.One.clearAllBits();
-    // The result will have at least as many trailing zeros as the non-mask
-    // operand since bits can only map to the same or higher bit position.
-    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
-    break;
-  }
-  case X86ISD::PEXT: {
-    Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
-    // The result has as many leading zeros as the number of zeroes in the 
mask.
-    unsigned Count = Known.Zero.popcount();
-    Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
-    Known.One.clearAllBits();
-    break;
-  }
   case X86ISD::VTRUNC:
   case X86ISD::VTRUNCS:
   case X86ISD::VTRUNCUS:
@@ -46015,34 +45996,6 @@ bool 
X86TargetLowering::SimplifyDemandedBitsForTargetNode(
 
     break;
   }
-  case X86ISD::PDEP: {
-    SDValue Op0 = Op.getOperand(0);
-    SDValue Op1 = Op.getOperand(1);
-
-    unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
-    APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
-
-    // If the demanded bits has leading zeroes, we don't demand those from the
-    // mask.
-    if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
-      return true;
-
-    // The number of possible 1s in the mask determines the number of LSBs of
-    // operand 0 used. Undemanded bits from the mask don't matter so filter
-    // them before counting.
-    KnownBits Known2;
-    uint64_t Count = (~Known.Zero & LoMask).popcount();
-    APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
-    if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
-      return true;
-
-    // Zeroes are retained from the mask, but not ones.
-    Known.One.clearAllBits();
-    // The result will have at least as many trailing zeros as the non-mask
-    // operand since bits can only map to the same or higher bit position.
-    Known.Zero.setLowBits(Known2.countMinTrailingZeros());
-    return false;
-  }
   case X86ISD::VPMADD52L:
   case X86ISD::VPMADD52H: {
     KnownBits KnownOp0, KnownOp1, KnownOp2;
@@ -63423,8 +63376,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::MOVDQ2Q:     return combineMOVDQ2Q(N, DAG);
   case X86ISD::BEXTR:
   case X86ISD::BEXTRI:
-  case X86ISD::BZHI:
-  case X86ISD::PDEP:        return combineBMI(N, DAG, DCI);
+  case X86ISD::BZHI:        return combineBMI(N, DAG, DCI);
   case X86ISD::PCLMULQDQ:   return combinePCLMULQDQ(N, DAG, DCI);
   case ISD::INTRINSIC_WO_CHAIN:  return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_W_CHAIN:  return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp 
b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index 4999581489e82..ad1c171428671 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -2259,94 +2259,6 @@ X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, 
IntrinsicInst &II) const {
       // TODO should we convert this to an AND if the RHS is constant?
     }
     break;
-  case Intrinsic::x86_bmi_pext_32:
-  case Intrinsic::x86_bmi_pext_64:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
-      if (MaskC->isNullValue()) {
-        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
-      }
-      if (MaskC->isAllOnesValue()) {
-        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
-      }
-
-      unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
-        // describes a subset of the input bits shifted to the appropriate
-        // position.  Replace with the straight forward IR.
-        Value *Input = II.getArgOperand(0);
-        Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
-        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
-        Value *Shifted = IC.Builder.CreateLShr(Masked, ShiftAmt);
-        return IC.replaceInstUsesWith(II, Shifted);
-      }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
-        uint64_t Src = SrcC->getZExtValue();
-        uint64_t Mask = MaskC->getZExtValue();
-        uint64_t Result = 0;
-        uint64_t BitToSet = 1;
-
-        while (Mask) {
-          // Isolate lowest set bit.
-          uint64_t BitToTest = Mask & -Mask;
-          if (BitToTest & Src)
-            Result |= BitToSet;
-
-          BitToSet <<= 1;
-          // Clear lowest set bit.
-          Mask &= Mask - 1;
-        }
-
-        return IC.replaceInstUsesWith(II,
-                                      ConstantInt::get(II.getType(), Result));
-      }
-    }
-    break;
-  case Intrinsic::x86_bmi_pdep_32:
-  case Intrinsic::x86_bmi_pdep_64:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
-      if (MaskC->isNullValue()) {
-        return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
-      }
-      if (MaskC->isAllOnesValue()) {
-        return IC.replaceInstUsesWith(II, II.getArgOperand(0));
-      }
-
-      unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
-        // describes a subset of the input bits shifted to the appropriate
-        // position.  Replace with the straight forward IR.
-        Value *Input = II.getArgOperand(0);
-        Value *ShiftAmt = ConstantInt::get(II.getType(), MaskIdx);
-        Value *Shifted = IC.Builder.CreateShl(Input, ShiftAmt);
-        Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
-        return IC.replaceInstUsesWith(II, Masked);
-      }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
-        uint64_t Src = SrcC->getZExtValue();
-        uint64_t Mask = MaskC->getZExtValue();
-        uint64_t Result = 0;
-        uint64_t BitToTest = 1;
-
-        while (Mask) {
-          // Isolate lowest set bit.
-          uint64_t BitToSet = Mask & -Mask;
-          if (BitToTest & Src)
-            Result |= BitToSet;
-
-          BitToTest <<= 1;
-          // Clear lowest set bit;
-          Mask &= Mask - 1;
-        }
-
-        return IC.replaceInstUsesWith(II,
-                                      ConstantInt::get(II.getType(), Result));
-      }
-    }
-    break;
 
   case Intrinsic::x86_sse_cvtss2si:
   case Intrinsic::x86_sse_cvtss2si64:
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td 
b/llvm/lib/Target/X86/X86InstrFragments.td
index 9316360c5e02a..923b968382866 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -424,10 +424,6 @@ def X86bextri : SDNode<"X86ISD::BEXTRI", SDTIntBinOp>;
 // Zero High Bits Starting with Specified Bit Position.
 def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntBinOp>;
 
-// Parallel extract and deposit.
-def X86pdep   : SDNode<"X86ISD::PDEP",   SDTIntBinOp>;
-def X86pext   : SDNode<"X86ISD::PEXT",   SDTIntBinOp>;
-
 // X86-specific multiply by immediate.
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
diff --git a/llvm/lib/Target/X86/X86InstrMisc.td 
b/llvm/lib/Target/X86/X86InstrMisc.td
index 613a431fe365a..c6acaa697fdc7 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1391,55 +1391,17 @@ multiclass PdepPext<string m, X86TypeInfo t, 
SDPatternOperator node,
 }
 
 let Predicates = [HasBMI2, NoEGPR] in {
-  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep>, XD, VEX;
-  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX;
-  defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX;
-  defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX;
+  defm PDEP32 : PdepPext<"pdep", Xi32, pdep>, XD, VEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, pdep>, XD, REX_W, VEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, pext>, XS, VEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, pext>, XS, REX_W, VEX;
 }
 
 let Predicates = [HasBMI2, HasEGPR] in {
-  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX;
-  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX;
-  defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX;
-  defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX;
-}
-
-let Predicates = [HasBMI2, NoEGPR] in {
-  def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
-            (PEXT32rr GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
-            (PEXT32rm GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
-            (PEXT64rr GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
-            (PEXT64rm GR64:$src, i64mem:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
-            (PDEP32rr GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
-            (PDEP32rm GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
-            (PDEP64rr GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
-            (PDEP64rm GR64:$src, i64mem:$mask)>;
-}
-
-let Predicates = [HasBMI2, HasEGPR] in {
-  def : Pat<(i32 (pext GR32:$src, GR32:$mask)),
-            (PEXT32rr_EVEX GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pext GR32:$src, (loadi32 addr:$mask))),
-            (PEXT32rm_EVEX GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, GR64:$mask)),
-            (PEXT64rr_EVEX GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pext GR64:$src, (loadi64 addr:$mask))),
-            (PEXT64rm_EVEX GR64:$src, i64mem:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, GR32:$mask)),
-            (PDEP32rr_EVEX GR32:$src, GR32:$mask)>;
-  def : Pat<(i32 (pdep GR32:$src, (loadi32 addr:$mask))),
-            (PDEP32rm_EVEX GR32:$src, i32mem:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, GR64:$mask)),
-            (PDEP64rr_EVEX GR64:$src, GR64:$mask)>;
-  def : Pat<(i64 (pdep GR64:$src, (loadi64 addr:$mask))),
-            (PDEP64rm_EVEX GR64:$src, i64mem:$mask)>;
+  defm PDEP32 : PdepPext<"pdep", Xi32, pdep, "_EVEX">, XD, EVEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, pdep, "_EVEX">, XD, REX_W, EVEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, pext, "_EVEX">, XS, EVEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, pext, "_EVEX">, XS, REX_W, EVEX;
 }
 
 
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h 
b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 9e32ca23dafe2..a6b0db0230cf3 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1837,10 +1837,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
     X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
     X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
-    X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
-    X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
-    X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
-    X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
     X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB,
                        0),
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3cd7515eb7670..050a4e8cb27e4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2660,6 +2660,64 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst 
&CI) {
       return &CI;
     break;
   }
+  case Intrinsic::pdep:
+    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (MaskC->isNullValue())
+        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
+
+      if (MaskC->isAllOnesValue())
+        return replaceInstUsesWith(*II, II->getArgOperand(0));
+
+      unsigned MaskIdx, MaskLen;
+      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        Value *Input = II->getArgOperand(0);
+        Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+        Value *Shifted = Builder.CreateShl(Input, ShiftAmt);
+        Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1));
+        return replaceInstUsesWith(*II, Masked);
+      }
+
+      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
+        // constant folding.
+        APInt Result =
+            llvm::APIntOps::expandBits(SrcC->getValue(), MaskC->getValue());
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::get(II->getType(), Result));
+      }
+    }
+    break;
+  case Intrinsic::pext:
+    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+      if (MaskC->isNullValue())
+        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
+
+      if (MaskC->isAllOnesValue())
+        return replaceInstUsesWith(*II, II->getArgOperand(0));
+
+      unsigned MaskIdx, MaskLen;
+      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
+        // Any single contiguous sequence of 1s anywhere in the mask simply
+        // describes a subset of the input bits shifted to the appropriate
+        // position.  Replace with the straightforward IR.
+        Value *Input = II->getArgOperand(0);
+        Value *Masked = Builder.CreateAnd(Input, II->getArgOperand(1));
+        Value *ShiftAmt = ConstantInt::get(II->getType(), MaskIdx);
+        Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt);
+        return replaceInstUsesWith(*II, Shifted);
+      }
+
+      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
+        // constant folding.
+        APInt Result =
+            llvm::APIntOps::compressBits(SrcC->getValue(), MaskC->getValue());
+        return replaceInstUsesWith(*II,
+                                   ConstantInt::get(II->getType(), Result));
+      }
+    }
+    break;
   case Intrinsic::ptrmask: {
     unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType());
     KnownBits Known(BitWidth);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp 
b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index bbc9f5d1b7506..f37e21f2c6dbb 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3333,6 +3333,26 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  // Instrument the pdep/pext (parallel bits deposit/extract) intrinsics.
+  // Both intrinsics have the form Z = I(X, Y), where Y is the mask operand
+  // and the types of both operands and the result match.
+  // The same shadow propagation is used for both of them:
+  //   Sz = I(Sx, Y) | (sext (Sy != 0))
+  void handlePackedBits(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Type *ShadowTy = getShadowTy(&I);
+
+    // If any mask shadow bit is set, the whole result shadow becomes all-ones.
+    Value *SMask = getShadow(&I, 1);
+    SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
+                           ShadowTy);
+    // Run the same intrinsic on the first operand's shadow with the real mask.
+    Value *S = IRB.CreateIntrinsic(I.getIntrinsicID(), ShadowTy,
+                                   {getShadow(&I, 0), I.getOperand(1)});
+    setShadow(&I, IRB.CreateOr(SMask, S));
+    setOriginForNaryOp(I);
+  }
+
   /// Instrument llvm.memmove
   ///
   /// At this point we don't know if llvm.memmove will be inlined or not.
@@ -5873,6 +5893,11 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> {
       handleFunnelShift(I);
       break;
 
+    case Intrinsic::pdep:
+    case Intrinsic::pext:
+      handlePackedBits(I);
+      break;
+
     case Intrinsic::is_constant:
       // The result of llvm.is.constant() is always defined.
       setShadow(&I, getCleanShadow(&I));
@@ -6503,10 +6528,6 @@ struct MemorySanitizerVisitor : public 
InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::x86_bmi_bextr_64:
     case Intrinsic::x86_bmi_bzhi_32:
     case Intrinsic::x86_bmi_bzhi_64:
-    case Intrinsic::x86_bmi_pdep_32:
-    case Intrinsic::x86_bmi_pdep_64:
-    case Intrinsic::x86_bmi_pext_32:
-    case Intrinsic::x86_bmi_pext_64:
       handleBmiIntrinsic(I);
       break;
 
diff --git a/llvm/test/CodeGen/X86/bmi2.ll b/llvm/test/CodeGen/X86/bmi2.ll
index cabeebb0c3f36..41585bde9a696 100644
--- a/llvm/test/CodeGen/X86/bmi2.ll
+++ b/llvm/test/CodeGen/X86/bmi2.ll
@@ -128,7 +128,7 @@ define i32 @pdep32_load(i32 %x, ptr %y)   {
 define i32 @pdep32_anyext(i16 %x)   {
 ; X86-LABEL: pdep32_anyext:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1431655766, %ecx # imm = 0xAAAAAAAA
 ; X86-NEXT:    pdepl %ecx, %eax, %eax
 ; X86-NEXT:    retl
@@ -178,7 +178,7 @@ define i32 @pdep32_demandedbits(i32 %x) {
 define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
 ; X86-LABEL: pdep32_demandedbits2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    andl $128, %eax
 ; X86-NEXT:    retl
@@ -203,9 +203,8 @@ define i32 @pdep32_demandedbits2(i32 %x, i32 %y) {
 define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
 ; X86-LABEL: pdep32_demandedbits_mask:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pdepl %eax, %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    andl $32768, %eax # imm = 0x8000
 ; X86-NEXT:    retl
 ;
@@ -230,9 +229,8 @@ define i32 @pdep32_demandedbits_mask(i32 %x, i16 %y) {
 define i32 @pdep32_demandedbits_mask2(i32 %x, i16 %y) {
 ; X86-LABEL: pdep32_demandedbits_mask2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movswl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pdepl %eax, %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    movzwl %ax, %eax
 ; X86-NEXT:    retl
 ;
@@ -285,22 +283,23 @@ define i32 @pdep32_knownbits(i32 %x) {
 define i32 @pdep32_knownbits2(i32 %x, i32 %y) {
 ; X86-LABEL: pdep32_knownbits2:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl $-256, %eax
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $8, %eax
 ; X86-NEXT:    pdepl {{[0-9]+}}(%esp), %eax, %eax
 ; X86-NEXT:    imull %eax, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: pdep32_knownbits2:
 ; X64:       # %bb.0:
-; X64-NEXT:    andl $-256, %edi
+; X64-NEXT:    andl $16776960, %edi # imm = 0xFFFF00
 ; X64-NEXT:    pdepl %esi, %edi, %eax
 ; X64-NEXT:    imull %eax, %eax
 ; X64-NEXT:    retq
 ;
 ; EGPR-LABEL: pdep32_knownbits2:
 ; EGPR:       # %bb.0:
-; EGPR-NEXT:    andl $-256, %edi # encoding: [0x81,0xe7,0x00,0xff,0xff,0xff]
+; EGPR-NEXT:    andl $16776960, %edi # encoding: 
[0x81,0xe7,0x00,0xff,0xff,0x00]
+; EGPR-NEXT:    # imm = 0xFFFF00
 ; EGPR-NEXT:    pdepl %esi, %edi, %eax # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x43,0xf5,0xc6]
 ; EGPR-NEXT:    imull %eax, %eax # encoding: [0x0f,0xaf,0xc0]
 ; EGPR-NEXT:    retq # encoding: [0xc3]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll 
b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
index 46bec2956c73c..208546ec56246 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll
@@ -110,9 +110,9 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pdep.32(i32 [[TMP1]], 
i32 [[B]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP1]], i32 
[[B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[C:%.*]] = tail call i32 @llvm.x86.bmi.pdep.32(i32 [[A]], 
i32 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]])
 ; CHECK-NEXT:    store i32 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -131,9 +131,9 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pdep.64(i64 [[TMP1]], 
i64 [[B]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP1]], i64 
[[B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.x86.bmi.pdep.64(i64 [[A]], 
i64 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]])
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[C]]
 ;
@@ -152,9 +152,9 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[TMP2]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.bmi.pext.32(i32 [[TMP1]], 
i32 [[B]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP1]], i32 
[[B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[C:%.*]] = tail call i32 @llvm.x86.bmi.pext.32(i32 [[A]], 
i32 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]])
 ; CHECK-NEXT:    store i32 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -173,9 +173,9 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = sext i1 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.x86.bmi.pext.64(i64 [[TMP1]], 
i64 [[B]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP1]], i64 
[[B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.x86.bmi.pext.64(i64 [[A]], 
i64 [[B]])
+; CHECK-NEXT:    [[C:%.*]] = call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]])
 ; CHECK-NEXT:    store i64 [[TMP5]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[C]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
index 5a94f6abfa773..f323f386d0f50 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/pdep.ll
@@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory {
 ; CHECK-LABEL: define i8 @Test_pdep_8(
 ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.pdep.i8(i8 [[TMP2]], i8 [[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i8 @llvm.pdep.i8(i8 [[A]], i8 [[B]])
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i8 [[C]]
@@ -23,10 +26,13 @@ define i8 @Test_pdep_8(i8 %a, i8 %b) sanitize_memory {
 define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory {
 ; CHECK-LABEL: define i16 @Test_pdep_16(
 ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.pdep.i16(i16 [[TMP2]], i16 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i16 @llvm.pdep.i16(i16 [[A]], i16 [[B]])
 ; CHECK-NEXT:    store i16 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i16 [[C]]
@@ -39,10 +45,13 @@ define i16 @Test_pdep_16(i16 %a, i16 %b) sanitize_memory {
 define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
 ; CHECK-LABEL: define i32 @Test_pdep_32(
 ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.pdep.i32(i32 [[TMP2]], i32 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[A]], i32 [[B]])
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[C]]
@@ -55,10 +64,13 @@ define i32 @Test_pdep_32(i32 %a, i32 %b) sanitize_memory {
 define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
 ; CHECK-LABEL: define i64 @Test_pdep_64(
 ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.pdep.i64(i64 [[TMP2]], i64 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[A]], i64 [[B]])
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[C]]
@@ -70,10 +82,13 @@ define i64 @Test_pdep_64(i64 %a, i64 %b) sanitize_memory {
 define i128 @Test_pdep_128(i128 %a, i128 %b) sanitize_memory {
 ; CHECK-LABEL: define i128 @Test_pdep_128(
 ; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP5:%.*]] = call i128 @llvm.pdep.i128(i128 [[TMP2]], i128 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i128 @llvm.pdep.i128(i128 [[A]], i128 
[[B]])
 ; CHECK-NEXT:    store i128 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i128 [[C]]
diff --git a/llvm/test/Instrumentation/MemorySanitizer/pext.ll b/llvm/test/Instrumentation/MemorySanitizer/pext.ll
index 72c4834998446..2caf6a47ac93b 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/pext.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/pext.ll
@@ -7,10 +7,13 @@ target triple = "x86_64-unknown-linux-gnu"
 define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory {
 ; CHECK-LABEL: define i8 @Test_pext_8(
 ; CHECK-SAME: i8 [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i8 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i8
+; CHECK-NEXT:    [[TMP5:%.*]] = call i8 @llvm.pext.i8(i8 [[TMP2]], i8 [[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i8 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i8 @llvm.pext.i8(i8 [[A]], i8 [[B]])
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i8 [[C]]
@@ -23,10 +26,13 @@ define i8 @Test_pext_8(i8 %a, i8 %b) sanitize_memory {
 define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory {
 ; CHECK-LABEL: define i16 @Test_pext_16(
 ; CHECK-SAME: i16 [[A:%.*]], i16 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i16 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP5:%.*]] = call i16 @llvm.pext.i16(i16 [[TMP2]], i16 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i16 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i16 @llvm.pext.i16(i16 [[A]], i16 [[B]])
 ; CHECK-NEXT:    store i16 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i16 [[C]]
@@ -39,10 +45,13 @@ define i16 @Test_pext_16(i16 %a, i16 %b) sanitize_memory {
 define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
 ; CHECK-LABEL: define i32 @Test_pext_32(
 ; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.pext.i32(i32 [[TMP2]], i32 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i32 @llvm.pext.i32(i32 [[A]], i32 [[B]])
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i32 [[C]]
@@ -55,10 +64,13 @@ define i32 @Test_pext_32(i32 %a, i32 %b) sanitize_memory {
 define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
 ; CHECK-LABEL: define i64 @Test_pext_64(
 ; CHECK-SAME: i64 [[A:%.*]], i64 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 8), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.pext.i64(i64 [[TMP2]], i64 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i64 @llvm.pext.i64(i64 [[A]], i64 [[B]])
 ; CHECK-NEXT:    store i64 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i64 [[C]]
@@ -70,10 +82,13 @@ define i64 @Test_pext_64(i64 %a, i64 %b) sanitize_memory {
 define i128 @Test_pext_128(i128 %a, i128 %b) sanitize_memory {
 ; CHECK-LABEL: define i128 @Test_pext_128(
 ; CHECK-SAME: i128 [[A:%.*]], i128 [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = sext i1 [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP5:%.*]] = call i128 @llvm.pext.i128(i128 [[TMP2]], i128 
[[B]])
+; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[C:%.*]] = tail call i128 @llvm.pext.i128(i128 [[A]], i128 
[[B]])
 ; CHECK-NEXT:    store i128 [[TMP6]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret i128 [[C]]
diff --git a/llvm/test/Transforms/InstCombine/pdep.ll b/llvm/test/Transforms/InstCombine/pdep.ll
index ceb4d1f97b6b0..b726e87a6168c 100644
--- a/llvm/test/Transforms/InstCombine/pdep.ll
+++ b/llvm/test/Transforms/InstCombine/pdep.ll
@@ -3,8 +3,7 @@
 
 define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_32_zero_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], 
i32 0)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 0)
   ret i32 %1
@@ -12,8 +11,7 @@ define i32 @test_pdep_32_zero_mask(i32 %x) nounwind readnone {
 
 define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_64_zero_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], 
i64 0)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 0
 ;
   %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 0)
   ret i64 %1
@@ -21,8 +19,7 @@ define i64 @test_pdep_64_zero_mask(i64 %x) nounwind readnone {
 
 define i32 @test_pdep_32_allones_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_32_allones_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], 
i32 -1)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP1:%.*]]
 ;
   %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 -1)
   ret i32 %1
@@ -30,8 +27,7 @@ define i32 @test_pdep_32_allones_mask(i32 %x) nounwind 
readnone {
 
 define i64 @test_pdep_64_allones_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_64_allones_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], 
i64 -1)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 [[TMP1:%.*]]
 ;
   %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 -1)
   ret i64 %1
@@ -39,7 +35,8 @@ define i64 @test_pdep_64_allones_mask(i64 %x) nounwind 
readnone {
 
 define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_32_shifted_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 [[X:%.*]], 
i32 12)
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[X:%.*]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP2]], 12
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = tail call i32 @llvm.pdep.i32(i32 %x, i32 12)
@@ -48,7 +45,8 @@ define i32 @test_pdep_32_shifted_mask(i32 %x) nounwind 
readnone {
 
 define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pdep_64_shifted_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 [[X:%.*]], 
i64 12)
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[X:%.*]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP2]], 12
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %1 = tail call i64 @llvm.pdep.i64(i64 %x, i64 12)
@@ -57,8 +55,7 @@ define i64 @test_pdep_64_shifted_mask(i64 %x) nounwind 
readnone {
 
 define i32 @test_pdep_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_pdep_32_constant_fold(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, 
i32 -252645136)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 807407616
 ;
   %1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4042322160)
   ret i32 %1
@@ -66,8 +63,7 @@ define i32 @test_pdep_32_constant_fold() nounwind readnone {
 
 define i64 @test_pdep_64_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_pdep_64_constant_fold(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 
8526495043095935640, i64 -1085102592571150096)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 -1089641583808049024
 ;
   %1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 
-1085102592571150096)
   ret i64 %1
@@ -75,8 +71,7 @@ define i64 @test_pdep_64_constant_fold() nounwind readnone {
 
 define i32 @test_pdep_32_constant_fold_2() nounwind readnone {
 ; CHECK-LABEL: @test_pdep_32_constant_fold_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pdep.i32(i32 1985229328, 
i32 -16776961)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 838860816
 ;
   %1 = tail call i32 @llvm.pdep.i32(i32 1985229328, i32 4278190335)
   ret i32 %1
@@ -84,8 +79,7 @@ define i32 @test_pdep_32_constant_fold_2() nounwind readnone {
 
 define i64 @test_pdep_64_constant_fold_2() nounwind readnone {
 ; CHECK-LABEL: @test_pdep_64_constant_fold_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pdep.i64(i64 
8526495043095935640, i64 -72056498804490496)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 -144114243170822144
 ;
   %1 = tail call i64 @llvm.pdep.i64(i64 8526495043095935640, i64 
-72056498804490496)
   ret i64 %1
diff --git a/llvm/test/Transforms/InstCombine/pext.ll b/llvm/test/Transforms/InstCombine/pext.ll
index 52baa9a171c62..0f13f3f542023 100644
--- a/llvm/test/Transforms/InstCombine/pext.ll
+++ b/llvm/test/Transforms/InstCombine/pext.ll
@@ -3,8 +3,7 @@
 
 define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_32_zero_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], 
i32 0)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 0
 ;
   %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 0)
   ret i32 %1
@@ -12,8 +11,7 @@ define i32 @test_pext_32_zero_mask(i32 %x) nounwind readnone {
 
 define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_64_zero_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], 
i64 0)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 0
 ;
   %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 0)
   ret i64 %1
@@ -21,8 +19,7 @@ define i64 @test_pext_64_zero_mask(i64 %x) nounwind readnone {
 
 define i32 @test_pext_32_allones_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_32_allones_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], 
i32 -1)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 [[TMP1:%.*]]
 ;
   %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 -1)
   ret i32 %1
@@ -30,8 +27,7 @@ define i32 @test_pext_32_allones_mask(i32 %x) nounwind 
readnone {
 
 define i64 @test_pext_64_allones_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_64_allones_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], 
i64 -1)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 [[TMP1:%.*]]
 ;
   %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 -1)
   ret i64 %1
@@ -39,7 +35,8 @@ define i64 @test_pext_64_allones_mask(i64 %x) nounwind 
readnone {
 
 define i32 @test_pext_32_shifted_mask(i32 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_32_shifted_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 [[X:%.*]], 
i32 6)
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[TMP2]], 3
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %1 = tail call i32 @llvm.pext.i32(i32 %x, i32 6)
@@ -48,7 +45,8 @@ define i32 @test_pext_32_shifted_mask(i32 %x) nounwind 
readnone {
 
 define i64 @test_pext_64_shifted_mask(i64 %x) nounwind readnone {
 ; CHECK-LABEL: @test_pext_64_shifted_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 [[X:%.*]], 
i64 6)
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[X:%.*]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP2]], 3
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %1 = tail call i64 @llvm.pext.i64(i64 %x, i64 6)
@@ -58,8 +56,7 @@ define i64 @test_pext_64_shifted_mask(i64 %x) nounwind 
readnone {
 
 define i32 @test_pext_32_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_pext_32_constant_fold(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, 
i32 -252645136)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 30001
 ;
   %1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4042322160)
   ret i32 %1
@@ -67,8 +64,7 @@ define i32 @test_pext_32_constant_fold() nounwind readnone {
 
 define i64 @test_pext_64_constant_fold() nounwind readnone {
 ; CHECK-LABEL: @test_pext_64_constant_fold(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 
8526495043095935640, i64 -1085102592571150096)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 1966210489
 ;
   %1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 
-1085102592571150096)
   ret i64 %1
@@ -76,8 +72,7 @@ define i64 @test_pext_64_constant_fold() nounwind readnone {
 
 define i32 @test_pext_32_constant_fold_2() nounwind readnone {
 ; CHECK-LABEL: @test_pext_32_constant_fold_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i32 @llvm.pext.i32(i32 1985229328, 
i32 -16776961)
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    ret i32 30224
 ;
   %1 = tail call i32 @llvm.pext.i32(i32 1985229328, i32 4278190335)
   ret i32 %1
@@ -85,8 +80,7 @@ define i32 @test_pext_32_constant_fold_2() nounwind readnone {
 
 define i64 @test_pext_64_constant_fold_2() nounwind readnone {
 ; CHECK-LABEL: @test_pext_64_constant_fold_2(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.pext.i64(i64 
8526495043095935640, i64 -72056498804490496)
-; CHECK-NEXT:    ret i64 [[TMP1]]
+; CHECK-NEXT:    ret i64 1980816570
 ;
   %1 = tail call i64 @llvm.pext.i64(i64 8526495043095935640, i64 
-72056498804490496)
   ret i64 %1

>From 5049452103c09e5f800ceb2f08e90e3c13d00e3e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Thu, 18 Jun 2026 12:10:39 +0100
Subject: [PATCH 2/4] Use InstSimplify

---
 llvm/lib/Analysis/InstructionSimplify.cpp          | 14 ++++++++++++++
 .../Transforms/InstCombine/InstCombineCalls.cpp    | 12 ------------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 7698d0d772a94..3b20592bcaed2 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -6930,6 +6930,20 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType,
       return Constant::getNullValue(ReturnType);
     break;
   }
+  case Intrinsic::pdep: {
+    if (match(Op1, m_Zero()))
+      return Constant::getNullValue(ReturnType);
+    if (match(Op1, m_AllOnes()))
+      return Op0;
+    break;
+  }
+  case Intrinsic::pext: {
+    if (match(Op1, m_Zero()))
+      return Constant::getNullValue(ReturnType);
+    if (match(Op1, m_AllOnes()))
+      return Op0;
+    break;
+  }
   case Intrinsic::ptrmask: {
     // NOTE: We can't apply this simplifications based on the value of Op1
     // because we need to preserve provenance.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 050a4e8cb27e4..980fd6391bbd1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2662,12 +2662,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   }
   case Intrinsic::pdep:
     if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
-      if (MaskC->isNullValue())
-        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
-
-      if (MaskC->isAllOnesValue())
-        return replaceInstUsesWith(*II, II->getArgOperand(0));
-
       unsigned MaskIdx, MaskLen;
       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
         // any single contingous sequence of 1s anywhere in the mask simply
@@ -2691,12 +2685,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     break;
   case Intrinsic::pext:
     if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
-      if (MaskC->isNullValue())
-        return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), 0));
-
-      if (MaskC->isAllOnesValue())
-        return replaceInstUsesWith(*II, II->getArgOperand(0));
-
       unsigned MaskIdx, MaskLen;
       if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
         // any single contingous sequence of 1s anywhere in the mask simply

>From 40076a7e0f3c2dc863d3e5d94b42c10c367ecf98 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Thu, 18 Jun 2026 12:21:50 +0100
Subject: [PATCH 3/4] Constant folding

---
 llvm/lib/Analysis/ConstantFolding.cpp            | 10 ++++++++++
 .../Transforms/InstCombine/InstCombineCalls.cpp  | 16 ----------------
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 3fe78d6c4322d..f18b7a0b66a21 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -1756,6 +1756,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
   case Intrinsic::fshl:
   case Intrinsic::fshr:
   case Intrinsic::clmul:
+  case Intrinsic::pdep:
+  case Intrinsic::pext:
   case Intrinsic::launder_invariant_group:
   case Intrinsic::strip_invariant_group:
   case Intrinsic::masked_load:
@@ -3904,6 +3906,14 @@ static Constant *ConstantFoldIntrinsicCall2(Intrinsic::ID IntrinsicID, Type *Ty,
       if (!C0 || !C1)
         return Constant::getNullValue(Ty);
       return ConstantInt::get(Ty, APIntOps::clmul(*C0, *C1));
+    case Intrinsic::pdep:
+      if (!C0 || !C1)
+        return Constant::getNullValue(Ty);
+      return ConstantInt::get(Ty, APIntOps::expandBits(*C0, *C1));
+    case Intrinsic::pext:
+      if (!C0 || !C1)
+        return Constant::getNullValue(Ty);
+      return ConstantInt::get(Ty, APIntOps::compressBits(*C0, *C1));
     case Intrinsic::amdgcn_wave_reduce_umin:
     case Intrinsic::amdgcn_wave_reduce_umax:
     case Intrinsic::amdgcn_wave_reduce_max:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 980fd6391bbd1..9e36aaaf8b508 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2673,14 +2673,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Value *Masked = Builder.CreateAnd(Shifted, II->getArgOperand(1));
         return replaceInstUsesWith(*II, Masked);
       }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
-        // constant folding.
-        APInt Result =
-            llvm::APIntOps::expandBits(SrcC->getValue(), MaskC->getValue());
-        return replaceInstUsesWith(*II,
-                                   ConstantInt::get(II->getType(), Result));
-      }
     }
     break;
   case Intrinsic::pext:
@@ -2696,14 +2688,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
         Value *Shifted = Builder.CreateLShr(Masked, ShiftAmt);
         return replaceInstUsesWith(*II, Shifted);
       }
-
-      if (auto *SrcC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
-        // constant folding.
-        APInt Result =
-            llvm::APIntOps::compressBits(SrcC->getValue(), MaskC->getValue());
-        return replaceInstUsesWith(*II,
-                                   ConstantInt::get(II->getType(), Result));
-      }
     }
     break;
   case Intrinsic::ptrmask: {

>From 7f95f8b1f959733e851081c7a25dc975758b49ae Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <[email protected]>
Date: Thu, 18 Jun 2026 12:29:58 +0100
Subject: [PATCH 4/4] Use PatternMatch to collect packed bits masks

---
 .../InstCombine/InstCombineCalls.cpp          | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 9e36aaaf8b508..1df156053e302 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2660,11 +2660,12 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       return &CI;
     break;
   }
-  case Intrinsic::pdep:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+  case Intrinsic::pdep: {
+    const APInt *MaskC;
+    if (match(II->getArgOperand(1), m_APInt(MaskC))) {
       unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
+      if (MaskC->isShiftedMask(MaskIdx, MaskLen)) {
+        // any single contiguous sequence of 1s anywhere in the mask simply
         // describes a subset of the input bits shifted to the appropriate
         // position.  Replace with the straight forward IR.
         Value *Input = II->getArgOperand(0);
@@ -2675,11 +2676,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
     break;
-  case Intrinsic::pext:
-    if (auto *MaskC = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+  }
+  case Intrinsic::pext: {
+    const APInt *MaskC;
+    if (match(II->getArgOperand(1), m_APInt(MaskC))) {
       unsigned MaskIdx, MaskLen;
-      if (MaskC->getValue().isShiftedMask(MaskIdx, MaskLen)) {
-        // any single contingous sequence of 1s anywhere in the mask simply
+      if (MaskC->isShiftedMask(MaskIdx, MaskLen)) {
+        // any single contiguous sequence of 1s anywhere in the mask simply
         // describes a subset of the input bits shifted to the appropriate
         // position.  Replace with the straight forward IR.
         Value *Input = II->getArgOperand(0);
@@ -2690,6 +2693,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       }
     }
     break;
+  }
   case Intrinsic::ptrmask: {
     unsigned BitWidth = DL.getPointerTypeSizeInBits(II->getType());
     KnownBits Known(BitWidth);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to