https://github.com/banach-space created 
https://github.com/llvm/llvm-project/pull/181974

- **[clang][ARM] Refactor argument handling in `EmitAArch64BuiltinExpr` (1/2) 
(NFC)**
- **[clang][ARM] Refactor argument handling in `EmitAArch64BuiltinExpr` (2/2) 
(NFC)**


From 5dc9f1a3aee2873a5169f35e1d3e97dfbcf1ef25 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <[email protected]>
Date: Mon, 16 Feb 2026 18:07:31 +0000
Subject: [PATCH 1/2] [clang][ARM] Refactor argument handling in
 `EmitAArch64BuiltinExpr` (1/2) (NFC)

Refactor `EmitAArch64BuiltinExpr` so that all AArch64/NEON builtins
handled by this hook _and marked as non-overloaded_ share a common path
for generating LLVM IR arguments (collected into the `Ops`
`SmallVector<Value*>`) (*).

Previously, the argument emission loop unconditionally skipped the
trailing argument:

```cpp
  for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; ++i)
```

This was originally intended to ignore the extra Sema-only argument
used by overloaded NEON builtins (e.g. the type discriminator passed
to `__builtin_neon_*` intrinsics). However, the skip was applied to
every builtin, including those without the extra argument.

This patch updates the loop to skip the trailing argument only when
`HasExtraNeonArgument` returns true for non-SISD builtins:

```cpp
  bool HasExtraArg = !IsSISD && HasExtraNeonArgument(BuiltinID);
  unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
  for (unsigned i = 0, e = NumArgs; i != e; ++i)
```

This preserves existing IR generation behaviour while making the
handling of Sema-only NEON discriminator arguments explicit.

For context, type discriminators appear in the definitions of various
intrinsics in `arm_neon.h`. For example, `vsriq_n_p64(<args>)` expands
to the following call:

```cpp
  __builtin_neon_vsriq_n_v(<args>, 38)
```

The trailing `38` encodes the concrete NEON vector type
(e.g. `poly64x2_t`) for overload resolution in Sema; it is not
semantically part of the operation and is ignored during IR generation.
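
For illustration, here is a rough sketch of how such a discriminator
can be decoded, assuming the `NeonTypeFlags` encoding from
`clang/include/clang/Basic/TargetBuiltins.h` (this is an informal
reading of that header, not code from this patch):

```cpp
// Hypothetical decoding of the Sema-only discriminator `38` (0x26):
// 0x26 == QuadFlag (0x20) | Poly64 (0x6), i.e. a 128-bit poly64x2_t.
NeonTypeFlags Type(38);
bool IsQuad = Type.isQuad();                    // true -> 128-bit vector
NeonTypeFlags::EltType Elt = Type.getEltType(); // NeonTypeFlags::Poly64
```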

As part of this change, `HasExtraNeonArgument` was extended so that
these discriminator arguments are correctly identified.

No functional change intended.

(*) This refers to the two large `switch` statements inside
`EmitAArch64BuiltinExpr` that are meant to split the processing between
non-overloaded and overloaded builtins. That split is not consistently
enforced: the second switch (nominally handling overloaded builtins)
also processes some non-overloaded cases. This patch refactors only the
first switch and prepares for a follow-up cleanup in 2/2.
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 298 +++++++++++++-----
 .../test/CodeGen/arm64-microsoft-intrinsics.c |  32 +-
 2 files changed, 238 insertions(+), 92 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index cb6bbfe07538e..f0dddf33ac5a0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -2710,46 +2710,203 @@ static Value *EmitRangePrefetchBuiltin(CodeGenFunction 
&CGF, unsigned BuiltinID,
 
 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
 /// argument that specifies the vector type.
+/// TODO: Make this return false for SISD builtins.
 static bool HasExtraNeonArgument(unsigned BuiltinID) {
   switch (BuiltinID) {
   default: break;
-  case NEON::BI__builtin_neon_vget_lane_i8:
-  case NEON::BI__builtin_neon_vget_lane_i16:
-  case NEON::BI__builtin_neon_vget_lane_bf16:
-  case NEON::BI__builtin_neon_vget_lane_i32:
-  case NEON::BI__builtin_neon_vget_lane_i64:
-  case NEON::BI__builtin_neon_vget_lane_mf8:
-  case NEON::BI__builtin_neon_vget_lane_f32:
-  case NEON::BI__builtin_neon_vgetq_lane_i8:
-  case NEON::BI__builtin_neon_vgetq_lane_i16:
-  case NEON::BI__builtin_neon_vgetq_lane_bf16:
-  case NEON::BI__builtin_neon_vgetq_lane_i32:
-  case NEON::BI__builtin_neon_vgetq_lane_i64:
-  case NEON::BI__builtin_neon_vgetq_lane_mf8:
-  case NEON::BI__builtin_neon_vgetq_lane_f32:
-  case NEON::BI__builtin_neon_vduph_lane_bf16:
-  case NEON::BI__builtin_neon_vduph_laneq_bf16:
+
+  // Cases from EmitARMBuiltinExpr
+  case NEON::BI__builtin_neon_vsha1h_u32:
+  case NEON::BI__builtin_neon_vsha1cq_u32:
+  case NEON::BI__builtin_neon_vsha1pq_u32:
+  case NEON::BI__builtin_neon_vsha1mq_u32:
+  case NEON::BI__builtin_neon_vcvth_bf16_f32:
+
+  case clang::ARM::BI_MoveToCoprocessor:
+  case clang::ARM::BI_MoveToCoprocessor2:
+
+  // Cases for non-overloaded builtins from EmitAArch64BuiltinExpr
+  case NEON::BI__builtin_neon_vabsh_f16:
+  case NEON::BI__builtin_neon_vaddq_p128:
+  case NEON::BI__builtin_neon_vldrq_p128:
+  case NEON::BI__builtin_neon_vstrq_p128:
+  case NEON::BI__builtin_neon_vcvts_f32_u32:
+  case NEON::BI__builtin_neon_vcvtd_f64_u64:
+  case NEON::BI__builtin_neon_vcvts_f32_s32:
+  case NEON::BI__builtin_neon_vcvtd_f64_s64:
+  case NEON::BI__builtin_neon_vcvth_f16_u16:
+  case NEON::BI__builtin_neon_vcvth_f16_u32:
+  case NEON::BI__builtin_neon_vcvth_f16_u64:
+  case NEON::BI__builtin_neon_vcvth_f16_s16:
+  case NEON::BI__builtin_neon_vcvth_f16_s32:
+  case NEON::BI__builtin_neon_vcvth_f16_s64:
+  case NEON::BI__builtin_neon_vcvtah_u16_f16:
+  case NEON::BI__builtin_neon_vcvtmh_u16_f16:
+  case NEON::BI__builtin_neon_vcvtnh_u16_f16:
+  case NEON::BI__builtin_neon_vcvtph_u16_f16:
+  case NEON::BI__builtin_neon_vcvth_u16_f16:
+  case NEON::BI__builtin_neon_vcvtah_s16_f16:
+  case NEON::BI__builtin_neon_vcvtmh_s16_f16:
+  case NEON::BI__builtin_neon_vcvtnh_s16_f16:
+  case NEON::BI__builtin_neon_vcvtph_s16_f16:
+  case NEON::BI__builtin_neon_vcvth_s16_f16:
+  case NEON::BI__builtin_neon_vcaleh_f16:
+  case NEON::BI__builtin_neon_vcalth_f16:
+  case NEON::BI__builtin_neon_vcageh_f16:
+  case NEON::BI__builtin_neon_vcagth_f16:
+  case NEON::BI__builtin_neon_vcvth_n_s16_f16:
+  case NEON::BI__builtin_neon_vcvth_n_u16_f16:
+  case NEON::BI__builtin_neon_vcvth_n_f16_s16:
+  case NEON::BI__builtin_neon_vcvth_n_f16_u16:
+  case NEON::BI__builtin_neon_vpaddd_s64:
+  case NEON::BI__builtin_neon_vpaddd_f64:
+  case NEON::BI__builtin_neon_vpadds_f32:
+  case NEON::BI__builtin_neon_vceqzd_s64:
+  case NEON::BI__builtin_neon_vceqzd_f64:
+  case NEON::BI__builtin_neon_vceqzs_f32:
+  case NEON::BI__builtin_neon_vceqzh_f16:
+  case NEON::BI__builtin_neon_vcgezd_s64:
+  case NEON::BI__builtin_neon_vcgezd_f64:
+  case NEON::BI__builtin_neon_vcgezs_f32:
+  case NEON::BI__builtin_neon_vcgezh_f16:
+  case NEON::BI__builtin_neon_vclezd_s64:
+  case NEON::BI__builtin_neon_vclezd_f64:
+  case NEON::BI__builtin_neon_vclezs_f32:
+  case NEON::BI__builtin_neon_vclezh_f16:
+  case NEON::BI__builtin_neon_vcgtzd_s64:
+  case NEON::BI__builtin_neon_vcgtzd_f64:
+  case NEON::BI__builtin_neon_vcgtzs_f32:
+  case NEON::BI__builtin_neon_vcgtzh_f16:
+  case NEON::BI__builtin_neon_vcltzd_s64:
+  case NEON::BI__builtin_neon_vcltzd_f64:
+  case NEON::BI__builtin_neon_vcltzs_f32:
+  case NEON::BI__builtin_neon_vcltzh_f16:
+  case NEON::BI__builtin_neon_vceqzd_u64:
+  case NEON::BI__builtin_neon_vceqd_f64:
+  case NEON::BI__builtin_neon_vcled_f64:
+  case NEON::BI__builtin_neon_vcltd_f64:
+  case NEON::BI__builtin_neon_vcged_f64:
+  case NEON::BI__builtin_neon_vcgtd_f64:
+  case NEON::BI__builtin_neon_vceqs_f32:
+  case NEON::BI__builtin_neon_vcles_f32:
+  case NEON::BI__builtin_neon_vclts_f32:
+  case NEON::BI__builtin_neon_vcges_f32:
+  case NEON::BI__builtin_neon_vcgts_f32:
+  case NEON::BI__builtin_neon_vceqh_f16:
+  case NEON::BI__builtin_neon_vcleh_f16:
+  case NEON::BI__builtin_neon_vclth_f16:
+  case NEON::BI__builtin_neon_vcgeh_f16:
+  case NEON::BI__builtin_neon_vcgth_f16:
+  case NEON::BI__builtin_neon_vceqd_s64:
+  case NEON::BI__builtin_neon_vceqd_u64:
+  case NEON::BI__builtin_neon_vcgtd_s64:
+  case NEON::BI__builtin_neon_vcgtd_u64:
+  case NEON::BI__builtin_neon_vcltd_s64:
+  case NEON::BI__builtin_neon_vcltd_u64:
+  case NEON::BI__builtin_neon_vcged_u64:
+  case NEON::BI__builtin_neon_vcged_s64:
+  case NEON::BI__builtin_neon_vcled_u64:
+  case NEON::BI__builtin_neon_vcled_s64:
+  case NEON::BI__builtin_neon_vnegd_s64:
+  case NEON::BI__builtin_neon_vnegh_f16:
+  case NEON::BI__builtin_neon_vtstd_s64:
+  case NEON::BI__builtin_neon_vtstd_u64:
   case NEON::BI__builtin_neon_vset_lane_i8:
-  case NEON::BI__builtin_neon_vset_lane_mf8:
   case NEON::BI__builtin_neon_vset_lane_i16:
-  case NEON::BI__builtin_neon_vset_lane_bf16:
   case NEON::BI__builtin_neon_vset_lane_i32:
   case NEON::BI__builtin_neon_vset_lane_i64:
+  case NEON::BI__builtin_neon_vset_lane_bf16:
   case NEON::BI__builtin_neon_vset_lane_f32:
   case NEON::BI__builtin_neon_vsetq_lane_i8:
-  case NEON::BI__builtin_neon_vsetq_lane_mf8:
   case NEON::BI__builtin_neon_vsetq_lane_i16:
-  case NEON::BI__builtin_neon_vsetq_lane_bf16:
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
-  case NEON::BI__builtin_neon_vsha1h_u32:
-  case NEON::BI__builtin_neon_vsha1cq_u32:
-  case NEON::BI__builtin_neon_vsha1pq_u32:
-  case NEON::BI__builtin_neon_vsha1mq_u32:
-  case NEON::BI__builtin_neon_vcvth_bf16_f32:
-  case clang::ARM::BI_MoveToCoprocessor:
-  case clang::ARM::BI_MoveToCoprocessor2:
+  case NEON::BI__builtin_neon_vset_lane_f64:
+  case NEON::BI__builtin_neon_vset_lane_mf8:
+  case NEON::BI__builtin_neon_vsetq_lane_mf8:
+  case NEON::BI__builtin_neon_vsetq_lane_f64:
+  case NEON::BI__builtin_neon_vget_lane_i8:
+  case NEON::BI__builtin_neon_vdupb_lane_i8:
+  case NEON::BI__builtin_neon_vgetq_lane_i8:
+  case NEON::BI__builtin_neon_vdupb_laneq_i8:
+  case NEON::BI__builtin_neon_vget_lane_mf8:
+  case NEON::BI__builtin_neon_vdupb_lane_mf8:
+  case NEON::BI__builtin_neon_vgetq_lane_mf8:
+  case NEON::BI__builtin_neon_vdupb_laneq_mf8:
+  case NEON::BI__builtin_neon_vget_lane_i16:
+  case NEON::BI__builtin_neon_vduph_lane_i16:
+  case NEON::BI__builtin_neon_vgetq_lane_i16:
+  case NEON::BI__builtin_neon_vduph_laneq_i16:
+  case NEON::BI__builtin_neon_vget_lane_i32:
+  case NEON::BI__builtin_neon_vdups_lane_i32:
+  case NEON::BI__builtin_neon_vdups_lane_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_i32:
+  case NEON::BI__builtin_neon_vdups_laneq_i32:
+  case NEON::BI__builtin_neon_vget_lane_i64:
+  case NEON::BI__builtin_neon_vdupd_lane_i64:
+  case NEON::BI__builtin_neon_vdupd_lane_f64:
+  case NEON::BI__builtin_neon_vgetq_lane_i64:
+  case NEON::BI__builtin_neon_vdupd_laneq_i64:
+  case NEON::BI__builtin_neon_vget_lane_f32:
+  case NEON::BI__builtin_neon_vget_lane_f64:
+  case NEON::BI__builtin_neon_vgetq_lane_f32:
+  case NEON::BI__builtin_neon_vdups_laneq_f32:
+  case NEON::BI__builtin_neon_vgetq_lane_f64:
+  case NEON::BI__builtin_neon_vdupd_laneq_f64:
+  case NEON::BI__builtin_neon_vaddh_f16:
+  case NEON::BI__builtin_neon_vsubh_f16:
+  case NEON::BI__builtin_neon_vmulh_f16:
+  case NEON::BI__builtin_neon_vdivh_f16:
+  case NEON::BI__builtin_neon_vfmah_f16:
+  case NEON::BI__builtin_neon_vfmsh_f16:
+  case NEON::BI__builtin_neon_vaddd_s64:
+  case NEON::BI__builtin_neon_vaddd_u64:
+  case NEON::BI__builtin_neon_vsubd_s64:
+  case NEON::BI__builtin_neon_vsubd_u64:
+  case NEON::BI__builtin_neon_vqdmlalh_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_s16:
+  case NEON::BI__builtin_neon_vqshlud_n_s64:
+  case NEON::BI__builtin_neon_vqshld_n_u64:
+  case NEON::BI__builtin_neon_vqshld_n_s64:
+  case NEON::BI__builtin_neon_vrshrd_n_u64:
+  case NEON::BI__builtin_neon_vrshrd_n_s64:
+  case NEON::BI__builtin_neon_vrsrad_n_u64:
+  case NEON::BI__builtin_neon_vrsrad_n_s64:
+  case NEON::BI__builtin_neon_vshld_n_s64:
+  case NEON::BI__builtin_neon_vshld_n_u64:
+  case NEON::BI__builtin_neon_vshrd_n_s64:
+  case NEON::BI__builtin_neon_vshrd_n_u64:
+  case NEON::BI__builtin_neon_vsrad_n_s64:
+  case NEON::BI__builtin_neon_vsrad_n_u64:
+  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
+  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16:
+  case NEON::BI__builtin_neon_vqdmlals_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_s32:
+  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
+  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32:
+  case NEON::BI__builtin_neon_vget_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_lane_f16:
+  case NEON::BI__builtin_neon_vgetq_lane_bf16:
+  case NEON::BI__builtin_neon_vduph_laneq_bf16:
+  case NEON::BI__builtin_neon_vduph_laneq_f16:
+  case NEON::BI__builtin_neon_vcvt_bf16_f32:
+  case NEON::BI__builtin_neon_vcvtq_low_bf16_f32:
+  case NEON::BI__builtin_neon_vcvtq_high_bf16_f32:
+  case clang::AArch64::BI_InterlockedAdd:
+  case clang::AArch64::BI_InterlockedAdd_acq:
+  case clang::AArch64::BI_InterlockedAdd_rel:
+  case clang::AArch64::BI_InterlockedAdd_nf:
+  case clang::AArch64::BI_InterlockedAdd64:
+  case clang::AArch64::BI_InterlockedAdd64_acq:
+  case clang::AArch64::BI_InterlockedAdd64_rel:
+  case clang::AArch64::BI_InterlockedAdd64_nf:
     return false;
   }
   return true;
@@ -5871,6 +6028,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   if (It != end(NEONEquivalentIntrinsicMap))
     BuiltinID = It->second;
 
+  // Check whether this is an SISD builtin.
+  auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
+  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
+      SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
+  bool IsSISD = (Builtin != nullptr);
+
   // Find out if any arguments are required to be integer constant
   // expressions.
   unsigned ICEArguments = 0;
@@ -5880,7 +6043,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
 
   llvm::SmallVector<Value*, 4> Ops;
   Address PtrOp0 = Address::invalid();
-  for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
+  // Note the assumption that SISD intrinsics do not contain extra arguments.
+  // TODO: Fold this into a single function call instead of, effectively, two
+  // separate checks.
+  bool HasExtraArg = !IsSISD && HasExtraNeonArgument(BuiltinID);
+  unsigned NumArgs = E->getNumArgs() - (HasExtraArg ? 1 : 0);
+  for (unsigned i = 0, e = NumArgs; i != e; i++) {
     if (i == 0) {
       switch (BuiltinID) {
       case NEON::BI__builtin_neon_vld1_v:
@@ -5907,12 +6075,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Ops.push_back(EmitScalarOrConstFoldImmArg(ICEArguments, i, E));
   }
 
-  auto SISDMap = ArrayRef(AArch64SISDIntrinsicMap);
-  const ARMVectorIntrinsicInfo *Builtin = findARMVectorIntrinsicInMap(
-      SISDMap, BuiltinID, AArch64SISDIntrinsicsProvenSorted);
-
   if (Builtin) {
-    Ops.push_back(EmitScalarExpr(E->getArg(E->getNumArgs() - 1)));
     Value *Result = EmitCommonNeonSISDBuiltinExpr(*this, *Builtin, Ops, E);
     assert(Result && "SISD intrinsic should have been handled");
     return Result;
@@ -5947,7 +6110,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   switch (BuiltinID) {
   default: break;
   case NEON::BI__builtin_neon_vabsh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, HalfTy), Ops, 
"vabs");
   case NEON::BI__builtin_neon_vaddq_p128: {
     llvm::Type *Ty = GetNeonType(this, NeonTypeFlags::Poly128);
@@ -5974,7 +6136,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     [[fallthrough]];
   case NEON::BI__builtin_neon_vcvts_f32_s32:
   case NEON::BI__builtin_neon_vcvtd_f64_s64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     bool Is64 = Ops[0]->getType()->getPrimitiveSizeInBits() == 64;
     llvm::Type *InTy = Is64 ? Int64Ty : Int32Ty;
     llvm::Type *FTy = Is64 ? DoubleTy : FloatTy;
@@ -5991,7 +6152,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vcvth_f16_s16:
   case NEON::BI__builtin_neon_vcvth_f16_s32:
   case NEON::BI__builtin_neon_vcvth_f16_s64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     llvm::Type *FTy = HalfTy;
     llvm::Type *InTy;
     if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64)
@@ -6018,7 +6178,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Type *InTy = Int16Ty;
     llvm::Type* FTy  = HalfTy;
     llvm::Type *Tys[2] = {InTy, FTy};
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     switch (BuiltinID) {
     default: llvm_unreachable("missing builtin ID in switch!");
     case NEON::BI__builtin_neon_vcvtah_u16_f16:
@@ -6051,7 +6210,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Type* InTy = Int32Ty;
     llvm::Type* FTy  = HalfTy;
     llvm::Type *Tys[2] = {InTy, FTy};
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     switch (BuiltinID) {
     default: llvm_unreachable("missing builtin ID in switch!");
     case NEON::BI__builtin_neon_vcageh_f16:
@@ -6071,7 +6229,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Type* InTy = Int32Ty;
     llvm::Type* FTy  = HalfTy;
     llvm::Type *Tys[2] = {InTy, FTy};
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     switch (BuiltinID) {
     default: llvm_unreachable("missing builtin ID in switch!");
     case NEON::BI__builtin_neon_vcvth_n_s16_f16:
@@ -6087,7 +6244,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     llvm::Type* FTy  = HalfTy;
     llvm::Type* InTy = Int32Ty;
     llvm::Type *Tys[2] = {FTy, InTy};
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     switch (BuiltinID) {
     default: llvm_unreachable("missing builtin ID in switch!");
     case NEON::BI__builtin_neon_vcvth_n_f16_s16:
@@ -6102,91 +6258,81 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "fcvth_n");
   }
   case NEON::BI__builtin_neon_vpaddd_s64: {
+    // TODO: Isn't this handled by
+    // EmitCommonNeonSISDBuiltinExpr?
     auto *Ty = llvm::FixedVectorType::get(Int64Ty, 2);
-    Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f64, so make sure it's bitcast to that.
-    Vec = Builder.CreateBitCast(Vec, Ty, "v2i64");
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty, "v2i64");
     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
-    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
-    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    Value *Op0 = Builder.CreateExtractElement(Ops[0], Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Ops[0], Idx1, "lane1");
     // Pairwise addition of a v2f64 into a scalar f64.
     return Builder.CreateAdd(Op0, Op1, "vpaddd");
   }
   case NEON::BI__builtin_neon_vpaddd_f64: {
     auto *Ty = llvm::FixedVectorType::get(DoubleTy, 2);
-    Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f64, so make sure it's bitcast to that.
-    Vec = Builder.CreateBitCast(Vec, Ty, "v2f64");
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty, "v2f64");
     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
-    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
-    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    Value *Op0 = Builder.CreateExtractElement(Ops[0], Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Ops[0], Idx1, "lane1");
     // Pairwise addition of a v2f64 into a scalar f64.
     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
   }
   case NEON::BI__builtin_neon_vpadds_f32: {
     auto *Ty = llvm::FixedVectorType::get(FloatTy, 2);
-    Value *Vec = EmitScalarExpr(E->getArg(0));
     // The vector is v2f32, so make sure it's bitcast to that.
-    Vec = Builder.CreateBitCast(Vec, Ty, "v2f32");
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty, "v2f32");
     llvm::Value *Idx0 = llvm::ConstantInt::get(SizeTy, 0);
     llvm::Value *Idx1 = llvm::ConstantInt::get(SizeTy, 1);
-    Value *Op0 = Builder.CreateExtractElement(Vec, Idx0, "lane0");
-    Value *Op1 = Builder.CreateExtractElement(Vec, Idx1, "lane1");
+    Value *Op0 = Builder.CreateExtractElement(Ops[0], Idx0, "lane0");
+    Value *Op1 = Builder.CreateExtractElement(Ops[0], Idx1, "lane1");
     // Pairwise addition of a v2f32 into a scalar f32.
     return Builder.CreateFAdd(Op0, Op1, "vpaddd");
   }
   case NEON::BI__builtin_neon_vceqzd_s64:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::ICMP_EQ, "vceqz");
   case NEON::BI__builtin_neon_vceqzd_f64:
   case NEON::BI__builtin_neon_vceqzs_f32:
   case NEON::BI__builtin_neon_vceqzh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OEQ, "vceqz");
   case NEON::BI__builtin_neon_vcgezd_s64:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::ICMP_SGE, "vcgez");
   case NEON::BI__builtin_neon_vcgezd_f64:
   case NEON::BI__builtin_neon_vcgezs_f32:
   case NEON::BI__builtin_neon_vcgezh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OGE, "vcgez");
   case NEON::BI__builtin_neon_vclezd_s64:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::ICMP_SLE, "vclez");
   case NEON::BI__builtin_neon_vclezd_f64:
   case NEON::BI__builtin_neon_vclezs_f32:
   case NEON::BI__builtin_neon_vclezh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OLE, "vclez");
   case NEON::BI__builtin_neon_vcgtzd_s64:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::ICMP_SGT, "vcgtz");
   case NEON::BI__builtin_neon_vcgtzd_f64:
   case NEON::BI__builtin_neon_vcgtzs_f32:
   case NEON::BI__builtin_neon_vcgtzh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OGT, "vcgtz");
   case NEON::BI__builtin_neon_vcltzd_s64:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::ICMP_SLT, "vcltz");
@@ -6194,13 +6340,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vcltzd_f64:
   case NEON::BI__builtin_neon_vcltzs_f32:
   case NEON::BI__builtin_neon_vcltzh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     return EmitAArch64CompareBuiltinExpr(
         Ops[0], ConvertType(E->getCallReturnType(getContext())),
         ICmpInst::FCMP_OLT, "vcltz");
 
   case NEON::BI__builtin_neon_vceqzd_u64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Ops[0] = Builder.CreateBitCast(Ops[0], Int64Ty);
     Ops[0] =
         Builder.CreateICmpEQ(Ops[0], llvm::Constant::getNullValue(Int64Ty));
@@ -6220,7 +6364,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     case NEON::BI__builtin_neon_vcged_f64: P = llvm::FCmpInst::FCMP_OGE; break;
     case NEON::BI__builtin_neon_vcgtd_f64: P = llvm::FCmpInst::FCMP_OGT; break;
     }
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
     Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy);
     if (P == llvm::FCmpInst::FCMP_OEQ)
@@ -6474,7 +6617,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vqdmlslh_s16: {
     SmallVector<Value *, 2> ProductOps;
     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
-    ProductOps.push_back(vectorWrapScalar16(EmitScalarExpr(E->getArg(2))));
+    ProductOps.push_back(vectorWrapScalar16(Ops[2]));
     auto *VTy = llvm::FixedVectorType::get(Int32Ty, 4);
     Ops[1] = EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmull, 
VTy),
                           ProductOps, "vqdmlXl");
@@ -6484,10 +6627,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlalh_s16
                                         ? Intrinsic::aarch64_neon_sqadd
                                         : Intrinsic::aarch64_neon_sqsub;
+    // Drop the 2nd multiplication argument before the accumulation
+    Ops.pop_back();
     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int32Ty), Ops, "vqdmlXl");
   }
   case NEON::BI__builtin_neon_vqshlud_n_s64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqshlu, 
Int64Ty),
                         Ops, "vqshlu_n");
@@ -6497,7 +6641,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = BuiltinID == NEON::BI__builtin_neon_vqshld_n_u64
               ? Intrinsic::aarch64_neon_uqshl
               : Intrinsic::aarch64_neon_sqshl;
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Ops[1] = Builder.CreateZExt(Ops[1], Int64Ty);
     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vqshl_n");
   }
@@ -6506,7 +6649,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = BuiltinID == NEON::BI__builtin_neon_vrshrd_n_u64
               ? Intrinsic::aarch64_neon_urshl
               : Intrinsic::aarch64_neon_srshl;
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     int SV = cast<ConstantInt>(Ops[1])->getSExtValue();
     Ops[1] = ConstantInt::get(Int64Ty, -SV);
     return EmitNeonCall(CGM.getIntrinsic(Int, Int64Ty), Ops, "vrshr_n");
@@ -6517,7 +6659,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
               ? Intrinsic::aarch64_neon_urshl
               : Intrinsic::aarch64_neon_srshl;
     Ops[1] = Builder.CreateBitCast(Ops[1], Int64Ty);
-    Ops.push_back(Builder.CreateNeg(EmitScalarExpr(E->getArg(2))));
+    Ops[2] = Builder.CreateNeg(Ops[2]);
     Ops[1] = Builder.CreateCall(CGM.getIntrinsic(Int, Int64Ty),
                                 {Ops[1], Builder.CreateSExt(Ops[2], Int64Ty)});
     return Builder.CreateAdd(Ops[0], Builder.CreateBitCast(Ops[1], Int64Ty));
@@ -6567,8 +6709,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
   case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
   case NEON::BI__builtin_neon_vqdmlslh_laneq_s16: {
-    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
-                                          "lane");
+    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "lane");
     SmallVector<Value *, 2> ProductOps;
     ProductOps.push_back(vectorWrapScalar16(Ops[1]));
     ProductOps.push_back(vectorWrapScalar16(Ops[2]));
@@ -6577,7 +6718,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
                           ProductOps, "vqdmlXl");
     Constant *CI = ConstantInt::get(SizeTy, 0);
     Ops[1] = Builder.CreateExtractElement(Ops[1], CI, "lane0");
-    Ops.pop_back();
+    // Drop lane-selection and the corresponding vector argument (these have
+    // already been used)
+    Ops.pop_back_n(2);
 
     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlalh_lane_s16 ||
                        BuiltinID == NEON::BI__builtin_neon_vqdmlalh_laneq_s16)
@@ -6597,21 +6740,24 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     unsigned AccumInt = BuiltinID == NEON::BI__builtin_neon_vqdmlals_s32
                                         ? Intrinsic::aarch64_neon_sqadd
                                         : Intrinsic::aarch64_neon_sqsub;
+    // Drop the 2nd multiplication argument before the accumulation
+    Ops.pop_back();
     return EmitNeonCall(CGM.getIntrinsic(AccumInt, Int64Ty), Ops, "vqdmlXl");
   }
   case NEON::BI__builtin_neon_vqdmlals_lane_s32:
   case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
   case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
   case NEON::BI__builtin_neon_vqdmlsls_laneq_s32: {
-    Ops[2] = Builder.CreateExtractElement(Ops[2], EmitScalarExpr(E->getArg(3)),
-                                          "lane");
+    Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "lane");
     SmallVector<Value *, 2> ProductOps;
     ProductOps.push_back(Ops[1]);
     ProductOps.push_back(Ops[2]);
     Ops[1] =
         EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_sqdmulls_scalar),
                      ProductOps, "vqdmlXl");
-    Ops.pop_back();
+    // Drop lane-selection and the corresponding vector argument (these have
+    // already been used)
+    Ops.pop_back_n(2);
 
     unsigned AccInt = (BuiltinID == NEON::BI__builtin_neon_vqdmlals_lane_s32 ||
                        BuiltinID == NEON::BI__builtin_neon_vqdmlals_laneq_s32)
@@ -6670,7 +6816,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case clang::AArch64::BI_InterlockedAdd64_rel:
   case clang::AArch64::BI_InterlockedAdd64_nf: {
     Address DestAddr = CheckAtomicAlignment(*this, E);
-    Value *Val = EmitScalarExpr(E->getArg(1));
+    Value *Val = Ops[1];
     llvm::AtomicOrdering Ordering;
     switch (BuiltinID) {
     case clang::AArch64::BI_InterlockedAdd:
diff --git a/clang/test/CodeGen/arm64-microsoft-intrinsics.c 
b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
index c0ff785883c71..2f5ab50d6c848 100644
--- a/clang/test/CodeGen/arm64-microsoft-intrinsics.c
+++ b/clang/test/CodeGen/arm64-microsoft-intrinsics.c
@@ -23,8 +23,8 @@ long test_InterlockedAdd_constant(int32_t volatile *Addend) {
 }
 
 // CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd(ptr %Addend, i32 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 seq_cst, 
align 4
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i32 %1 seq_cst, 
align 4
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd'
 
@@ -33,8 +33,8 @@ long test_InterlockedAdd_acq(int32_t volatile *Addend, long 
Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_acq(ptr %Addend, i32 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 acquire, 
align 4
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i32 %1 acquire, 
align 4
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_acq'
 
@@ -43,8 +43,8 @@ long test_InterlockedAdd_nf(int32_t volatile *Addend, long 
Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_nf(ptr %Addend, i32 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 monotonic, 
align 4
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i32 %1 monotonic, 
align 4
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_nf'
 
@@ -53,8 +53,8 @@ long test_InterlockedAdd_rel(int32_t volatile *Addend, long 
Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i32 @test_InterlockedAdd_rel(ptr %Addend, i32 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i32 %2 release, 
align 4
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i32 %1 release, 
align 4
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i32 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i32 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd_rel'
 
@@ -67,8 +67,8 @@ __int64 test_InterlockedAdd64_constant(__int64 volatile 
*Addend) {
 }
 
 // CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64(ptr %Addend, i64 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 seq_cst, 
align 8
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i64 %1 seq_cst, 
align 8
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64'
 
@@ -77,8 +77,8 @@ __int64 test_InterlockedAdd64_acq(__int64 volatile *Addend, 
__int64 Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_acq(ptr %Addend, i64 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 acquire, 
align 8
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i64 %1 acquire, 
align 8
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_acq'
 
@@ -87,8 +87,8 @@ __int64 test_InterlockedAdd64_nf(__int64 volatile *Addend, 
__int64 Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_nf(ptr %Addend, i64 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 monotonic, 
align 8
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i64 %1 monotonic, 
align 8
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_nf'
 
@@ -97,8 +97,8 @@ __int64 test_InterlockedAdd64_rel(__int64 volatile *Addend, 
__int64 Value) {
 }
 
 // CHECK-LABEL: define {{.*}} i64 @test_InterlockedAdd64_rel(ptr %Addend, i64 
%Value) {{.*}} {
-// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %1, i64 %2 release, 
align 8
-// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %2
+// CHECK-MSVC: %[[OLDVAL:[0-9]+]] = atomicrmw add ptr %2, i64 %1 release, 
align 8
+// CHECK-MSVC: %[[NEWVAL:[0-9]+]] = add i64 %[[OLDVAL:[0-9]+]], %1
 // CHECK-MSVC: ret i64 %[[NEWVAL:[0-9]+]]
 // CHECK-LINUX: error: call to undeclared function '_InterlockedAdd64_rel'
 

From aa5d820c960104c4e2c7497fedfe212402aa59b3 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <[email protected]>
Date: Wed, 18 Feb 2026 08:35:41 +0000
Subject: [PATCH 2/2] [clang][ARM] Refactor argument handling in
 `EmitAArch64BuiltinExpr` (2/2) (NFC)

Refactor `EmitAArch64BuiltinExpr` so that all AArch64/NEON builtins
handled by this hook _and marked as overloaded_ share a common path
for generating LLVM IR arguments (collected into the `Ops`
`SmallVector<Value*>`) (*). This is a follow-up to #181794; please
refer to that PR for more context.

As in the previous PR, the key change is implemented in
`HasExtraNeonArgument`, i.e. the hook that identifies builtins with the
extra argument. This PR replaces the ad-hoc switch statement with a
more principled approach borrowed from SemaARM.cpp, namely:
```cpp
  uint64_t mask = 0;
  switch (BuiltinID) {
#define GET_NEON_OVERLOAD_CHECK
#include "clang/Basic/arm_fp16.inc"
#include "clang/Basic/arm_neon.inc"
#undef GET_NEON_OVERLOAD_CHECK
  // Non-neon builtins for controlling VFP that take an extra argument
  // for discriminating the type.
  case ARM::BI__builtin_arm_vcvtr_f:
  case ARM::BI__builtin_arm_vcvtr_d:
    mask = 1;
  }
  switch (BuiltinID) {
  default: break;
  }

  if (mask)
    return true;

  return false;
```

This approach is preferred because the extra argument exists purely for
Sema verification; CodeGen should reuse the same source of truth rather
than duplicate or partially reimplement the logic.
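
For reference, the `GET_NEON_OVERLOAD_CHECK` sections of the generated
`.inc` files expand to one `case` per overloaded builtin that sets
`mask` to a bitmask of the accepted element types. A rough sketch of
the expanded shape (the builtin name is real, but the mask value below
is made up for illustration):

```cpp
switch (BuiltinID) {
// One generated case per overloaded NEON builtin; `mask` records which
// NeonTypeFlags element types Sema accepts for this builtin.
case NEON::BI__builtin_neon_vabs_v:
  mask = 0xfULL; // illustrative value, not from the generated file
  break;
// ...
}
```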

No functional change intended.

(*) `EmitAArch64BuiltinExpr` contains two large switch statements
intended to separate handling of non-overloaded and overloaded builtins.
In practice, the split is not consistently enforced. Patch 1/2
refactored the first switch (non-overloaded path). This patch applies
the same cleanup to the overloaded path and completes the refactoring.
---
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 297 ++++-------------------
 clang/lib/Sema/SemaARM.cpp               |   4 +-
 2 files changed, 48 insertions(+), 253 deletions(-)

diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index f0dddf33ac5a0..5032903a850c9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -446,9 +446,8 @@ Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
                                         ArrayRef<llvm::Type *> Tys,
                                         SmallVectorImpl<Value *> &Ops,
                                         const CallExpr *E, const char *name) {
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, 
E);
-  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+                     Ops.pop_back_val());
   return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
 }
 
@@ -2709,207 +2708,38 @@ static Value *EmitRangePrefetchBuiltin(CodeGenFunction 
&CGF, unsigned BuiltinID,
 }
 
 /// Return true if BuiltinID is an overloaded Neon intrinsic with an extra
-/// argument that specifies the vector type.
+/// argument that specifies the vector type. The additional argument is meant
+/// for Sema checking (see `CheckNeonBuiltinFunctionCall`) and this function
+/// should be kept consistent with the logic in Sema.
 /// TODO: Make this return false for SISD builtins.
 static bool HasExtraNeonArgument(unsigned BuiltinID) {
+  // Required by the headers included below, but not used in this
+  // particular function.
+  int PtrArgNum = -1;
+  bool HasConstPtr = false;
+
+  // The mask encodes the type. We don't care about the actual value. Instead,
+  // we just check whether it's been set.
+  uint64_t mask = 0;
+  switch (BuiltinID) {
+#define GET_NEON_OVERLOAD_CHECK
+#include "clang/Basic/arm_fp16.inc"
+#include "clang/Basic/arm_neon.inc"
+#undef GET_NEON_OVERLOAD_CHECK
+  // Non-neon builtins for controlling VFP that take an extra argument
+  // for discriminating the type.
+  case ARM::BI__builtin_arm_vcvtr_f:
+  case ARM::BI__builtin_arm_vcvtr_d:
+    mask = 1;
+  }
   switch (BuiltinID) {
   default: break;
+  }
 
-  // Cases from EmitARMBuiltinExpr
-  case NEON::BI__builtin_neon_vsha1h_u32:
-  case NEON::BI__builtin_neon_vsha1cq_u32:
-  case NEON::BI__builtin_neon_vsha1pq_u32:
-  case NEON::BI__builtin_neon_vsha1mq_u32:
-  case NEON::BI__builtin_neon_vcvth_bf16_f32:
-
-  case clang::ARM::BI_MoveToCoprocessor:
-  case clang::ARM::BI_MoveToCoprocessor2:
+  if (mask)
+    return true;
 
-  // Cases for non-overloaded builtins from EmitAArch64BuiltinExpr
-  case NEON::BI__builtin_neon_vabsh_f16:
-  case NEON::BI__builtin_neon_vaddq_p128:
-  case NEON::BI__builtin_neon_vldrq_p128:
-  case NEON::BI__builtin_neon_vstrq_p128:
-  case NEON::BI__builtin_neon_vcvts_f32_u32:
-  case NEON::BI__builtin_neon_vcvtd_f64_u64:
-  case NEON::BI__builtin_neon_vcvts_f32_s32:
-  case NEON::BI__builtin_neon_vcvtd_f64_s64:
-  case NEON::BI__builtin_neon_vcvth_f16_u16:
-  case NEON::BI__builtin_neon_vcvth_f16_u32:
-  case NEON::BI__builtin_neon_vcvth_f16_u64:
-  case NEON::BI__builtin_neon_vcvth_f16_s16:
-  case NEON::BI__builtin_neon_vcvth_f16_s32:
-  case NEON::BI__builtin_neon_vcvth_f16_s64:
-  case NEON::BI__builtin_neon_vcvtah_u16_f16:
-  case NEON::BI__builtin_neon_vcvtmh_u16_f16:
-  case NEON::BI__builtin_neon_vcvtnh_u16_f16:
-  case NEON::BI__builtin_neon_vcvtph_u16_f16:
-  case NEON::BI__builtin_neon_vcvth_u16_f16:
-  case NEON::BI__builtin_neon_vcvtah_s16_f16:
-  case NEON::BI__builtin_neon_vcvtmh_s16_f16:
-  case NEON::BI__builtin_neon_vcvtnh_s16_f16:
-  case NEON::BI__builtin_neon_vcvtph_s16_f16:
-  case NEON::BI__builtin_neon_vcvth_s16_f16:
-  case NEON::BI__builtin_neon_vcaleh_f16:
-  case NEON::BI__builtin_neon_vcalth_f16:
-  case NEON::BI__builtin_neon_vcageh_f16:
-  case NEON::BI__builtin_neon_vcagth_f16:
-  case NEON::BI__builtin_neon_vcvth_n_s16_f16:
-  case NEON::BI__builtin_neon_vcvth_n_u16_f16:
-  case NEON::BI__builtin_neon_vcvth_n_f16_s16:
-  case NEON::BI__builtin_neon_vcvth_n_f16_u16:
-  case NEON::BI__builtin_neon_vpaddd_s64:
-  case NEON::BI__builtin_neon_vpaddd_f64:
-  case NEON::BI__builtin_neon_vpadds_f32:
-  case NEON::BI__builtin_neon_vceqzd_s64:
-  case NEON::BI__builtin_neon_vceqzd_f64:
-  case NEON::BI__builtin_neon_vceqzs_f32:
-  case NEON::BI__builtin_neon_vceqzh_f16:
-  case NEON::BI__builtin_neon_vcgezd_s64:
-  case NEON::BI__builtin_neon_vcgezd_f64:
-  case NEON::BI__builtin_neon_vcgezs_f32:
-  case NEON::BI__builtin_neon_vcgezh_f16:
-  case NEON::BI__builtin_neon_vclezd_s64:
-  case NEON::BI__builtin_neon_vclezd_f64:
-  case NEON::BI__builtin_neon_vclezs_f32:
-  case NEON::BI__builtin_neon_vclezh_f16:
-  case NEON::BI__builtin_neon_vcgtzd_s64:
-  case NEON::BI__builtin_neon_vcgtzd_f64:
-  case NEON::BI__builtin_neon_vcgtzs_f32:
-  case NEON::BI__builtin_neon_vcgtzh_f16:
-  case NEON::BI__builtin_neon_vcltzd_s64:
-  case NEON::BI__builtin_neon_vcltzd_f64:
-  case NEON::BI__builtin_neon_vcltzs_f32:
-  case NEON::BI__builtin_neon_vcltzh_f16:
-  case NEON::BI__builtin_neon_vceqzd_u64:
-  case NEON::BI__builtin_neon_vceqd_f64:
-  case NEON::BI__builtin_neon_vcled_f64:
-  case NEON::BI__builtin_neon_vcltd_f64:
-  case NEON::BI__builtin_neon_vcged_f64:
-  case NEON::BI__builtin_neon_vcgtd_f64:
-  case NEON::BI__builtin_neon_vceqs_f32:
-  case NEON::BI__builtin_neon_vcles_f32:
-  case NEON::BI__builtin_neon_vclts_f32:
-  case NEON::BI__builtin_neon_vcges_f32:
-  case NEON::BI__builtin_neon_vcgts_f32:
-  case NEON::BI__builtin_neon_vceqh_f16:
-  case NEON::BI__builtin_neon_vcleh_f16:
-  case NEON::BI__builtin_neon_vclth_f16:
-  case NEON::BI__builtin_neon_vcgeh_f16:
-  case NEON::BI__builtin_neon_vcgth_f16:
-  case NEON::BI__builtin_neon_vceqd_s64:
-  case NEON::BI__builtin_neon_vceqd_u64:
-  case NEON::BI__builtin_neon_vcgtd_s64:
-  case NEON::BI__builtin_neon_vcgtd_u64:
-  case NEON::BI__builtin_neon_vcltd_s64:
-  case NEON::BI__builtin_neon_vcltd_u64:
-  case NEON::BI__builtin_neon_vcged_u64:
-  case NEON::BI__builtin_neon_vcged_s64:
-  case NEON::BI__builtin_neon_vcled_u64:
-  case NEON::BI__builtin_neon_vcled_s64:
-  case NEON::BI__builtin_neon_vnegd_s64:
-  case NEON::BI__builtin_neon_vnegh_f16:
-  case NEON::BI__builtin_neon_vtstd_s64:
-  case NEON::BI__builtin_neon_vtstd_u64:
-  case NEON::BI__builtin_neon_vset_lane_i8:
-  case NEON::BI__builtin_neon_vset_lane_i16:
-  case NEON::BI__builtin_neon_vset_lane_i32:
-  case NEON::BI__builtin_neon_vset_lane_i64:
-  case NEON::BI__builtin_neon_vset_lane_bf16:
-  case NEON::BI__builtin_neon_vset_lane_f32:
-  case NEON::BI__builtin_neon_vsetq_lane_i8:
-  case NEON::BI__builtin_neon_vsetq_lane_i16:
-  case NEON::BI__builtin_neon_vsetq_lane_i32:
-  case NEON::BI__builtin_neon_vsetq_lane_i64:
-  case NEON::BI__builtin_neon_vsetq_lane_bf16:
-  case NEON::BI__builtin_neon_vsetq_lane_f32:
-  case NEON::BI__builtin_neon_vset_lane_f64:
-  case NEON::BI__builtin_neon_vset_lane_mf8:
-  case NEON::BI__builtin_neon_vsetq_lane_mf8:
-  case NEON::BI__builtin_neon_vsetq_lane_f64:
-  case NEON::BI__builtin_neon_vget_lane_i8:
-  case NEON::BI__builtin_neon_vdupb_lane_i8:
-  case NEON::BI__builtin_neon_vgetq_lane_i8:
-  case NEON::BI__builtin_neon_vdupb_laneq_i8:
-  case NEON::BI__builtin_neon_vget_lane_mf8:
-  case NEON::BI__builtin_neon_vdupb_lane_mf8:
-  case NEON::BI__builtin_neon_vgetq_lane_mf8:
-  case NEON::BI__builtin_neon_vdupb_laneq_mf8:
-  case NEON::BI__builtin_neon_vget_lane_i16:
-  case NEON::BI__builtin_neon_vduph_lane_i16:
-  case NEON::BI__builtin_neon_vgetq_lane_i16:
-  case NEON::BI__builtin_neon_vduph_laneq_i16:
-  case NEON::BI__builtin_neon_vget_lane_i32:
-  case NEON::BI__builtin_neon_vdups_lane_i32:
-  case NEON::BI__builtin_neon_vdups_lane_f32:
-  case NEON::BI__builtin_neon_vgetq_lane_i32:
-  case NEON::BI__builtin_neon_vdups_laneq_i32:
-  case NEON::BI__builtin_neon_vget_lane_i64:
-  case NEON::BI__builtin_neon_vdupd_lane_i64:
-  case NEON::BI__builtin_neon_vdupd_lane_f64:
-  case NEON::BI__builtin_neon_vgetq_lane_i64:
-  case NEON::BI__builtin_neon_vdupd_laneq_i64:
-  case NEON::BI__builtin_neon_vget_lane_f32:
-  case NEON::BI__builtin_neon_vget_lane_f64:
-  case NEON::BI__builtin_neon_vgetq_lane_f32:
-  case NEON::BI__builtin_neon_vdups_laneq_f32:
-  case NEON::BI__builtin_neon_vgetq_lane_f64:
-  case NEON::BI__builtin_neon_vdupd_laneq_f64:
-  case NEON::BI__builtin_neon_vaddh_f16:
-  case NEON::BI__builtin_neon_vsubh_f16:
-  case NEON::BI__builtin_neon_vmulh_f16:
-  case NEON::BI__builtin_neon_vdivh_f16:
-  case NEON::BI__builtin_neon_vfmah_f16:
-  case NEON::BI__builtin_neon_vfmsh_f16:
-  case NEON::BI__builtin_neon_vaddd_s64:
-  case NEON::BI__builtin_neon_vaddd_u64:
-  case NEON::BI__builtin_neon_vsubd_s64:
-  case NEON::BI__builtin_neon_vsubd_u64:
-  case NEON::BI__builtin_neon_vqdmlalh_s16:
-  case NEON::BI__builtin_neon_vqdmlslh_s16:
-  case NEON::BI__builtin_neon_vqshlud_n_s64:
-  case NEON::BI__builtin_neon_vqshld_n_u64:
-  case NEON::BI__builtin_neon_vqshld_n_s64:
-  case NEON::BI__builtin_neon_vrshrd_n_u64:
-  case NEON::BI__builtin_neon_vrshrd_n_s64:
-  case NEON::BI__builtin_neon_vrsrad_n_u64:
-  case NEON::BI__builtin_neon_vrsrad_n_s64:
-  case NEON::BI__builtin_neon_vshld_n_s64:
-  case NEON::BI__builtin_neon_vshld_n_u64:
-  case NEON::BI__builtin_neon_vshrd_n_s64:
-  case NEON::BI__builtin_neon_vshrd_n_u64:
-  case NEON::BI__builtin_neon_vsrad_n_s64:
-  case NEON::BI__builtin_neon_vsrad_n_u64:
-  case NEON::BI__builtin_neon_vqdmlalh_lane_s16:
-  case NEON::BI__builtin_neon_vqdmlalh_laneq_s16:
-  case NEON::BI__builtin_neon_vqdmlslh_lane_s16:
-  case NEON::BI__builtin_neon_vqdmlslh_laneq_s16:
-  case NEON::BI__builtin_neon_vqdmlals_s32:
-  case NEON::BI__builtin_neon_vqdmlsls_s32:
-  case NEON::BI__builtin_neon_vqdmlals_lane_s32:
-  case NEON::BI__builtin_neon_vqdmlals_laneq_s32:
-  case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
-  case NEON::BI__builtin_neon_vqdmlsls_laneq_s32:
-  case NEON::BI__builtin_neon_vget_lane_bf16:
-  case NEON::BI__builtin_neon_vduph_lane_bf16:
-  case NEON::BI__builtin_neon_vduph_lane_f16:
-  case NEON::BI__builtin_neon_vgetq_lane_bf16:
-  case NEON::BI__builtin_neon_vduph_laneq_bf16:
-  case NEON::BI__builtin_neon_vduph_laneq_f16:
-  case NEON::BI__builtin_neon_vcvt_bf16_f32:
-  case NEON::BI__builtin_neon_vcvtq_low_bf16_f32:
-  case NEON::BI__builtin_neon_vcvtq_high_bf16_f32:
-  case clang::AArch64::BI_InterlockedAdd:
-  case clang::AArch64::BI_InterlockedAdd_acq:
-  case clang::AArch64::BI_InterlockedAdd_rel:
-  case clang::AArch64::BI_InterlockedAdd_nf:
-  case clang::AArch64::BI_InterlockedAdd64:
-  case clang::AArch64::BI_InterlockedAdd64_acq:
-  case clang::AArch64::BI_InterlockedAdd64_rel:
-  case clang::AArch64::BI_InterlockedAdd64_nf:
-    return false;
-  }
-  return true;
+  return false;
 }
 
 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
@@ -6956,7 +6786,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax");
   case NEON::BI__builtin_neon_vmaxh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Int = Intrinsic::aarch64_neon_fmax;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax");
   }
@@ -6967,7 +6796,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin");
   case NEON::BI__builtin_neon_vminh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Int = Intrinsic::aarch64_neon_fmin;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin");
   }
@@ -7010,7 +6838,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = Intrinsic::aarch64_neon_fminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm");
   case NEON::BI__builtin_neon_vminnmh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Int = Intrinsic::aarch64_neon_fminnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm");
   case NEON::BI__builtin_neon_vmaxnm_v:
@@ -7018,20 +6845,16 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = Intrinsic::aarch64_neon_fmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm");
   case NEON::BI__builtin_neon_vmaxnmh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     Int = Intrinsic::aarch64_neon_fmaxnm;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm");
   case NEON::BI__builtin_neon_vrecpss_f32: {
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, 
FloatTy),
                         Ops, "vrecps");
   }
   case NEON::BI__builtin_neon_vrecpsd_f64:
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, 
DoubleTy),
                         Ops, "vrecps");
   case NEON::BI__builtin_neon_vrecpsh_f16:
-    Ops.push_back(EmitScalarExpr(E->getArg(1)));
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, 
HalfTy),
                         Ops, "vrecps");
   case NEON::BI__builtin_neon_vqshrun_n_v:
@@ -7050,7 +6873,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = usgn ? Intrinsic::aarch64_neon_uqrshrn : 
Intrinsic::aarch64_neon_sqrshrn;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n");
   case NEON::BI__builtin_neon_vrndah_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_round
               : Intrinsic::round;
@@ -7064,14 +6886,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda");
   }
   case NEON::BI__builtin_neon_vrndih_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_nearbyint
               : Intrinsic::nearbyint;
     return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi");
   }
   case NEON::BI__builtin_neon_vrndmh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_floor
               : Intrinsic::floor;
@@ -7085,7 +6905,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm");
   }
   case NEON::BI__builtin_neon_vrndnh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_roundeven
               : Intrinsic::roundeven;
@@ -7099,14 +6918,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn");
   }
   case NEON::BI__builtin_neon_vrndns_f32: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_roundeven
               : Intrinsic::roundeven;
     return EmitNeonCall(CGM.getIntrinsic(Int, FloatTy), Ops, "vrndn");
   }
   case NEON::BI__builtin_neon_vrndph_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_ceil
               : Intrinsic::ceil;
@@ -7120,7 +6937,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp");
   }
   case NEON::BI__builtin_neon_vrndxh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_rint
               : Intrinsic::rint;
@@ -7134,7 +6950,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx");
   }
   case NEON::BI__builtin_neon_vrndh_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_trunc
               : Intrinsic::trunc;
@@ -7144,7 +6959,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vrnd32xq_f32:
   case NEON::BI__builtin_neon_vrnd32x_f64:
   case NEON::BI__builtin_neon_vrnd32xq_f64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Intrinsic::aarch64_neon_frint32x;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32x");
   }
@@ -7152,7 +6966,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vrnd32zq_f32:
   case NEON::BI__builtin_neon_vrnd32z_f64:
   case NEON::BI__builtin_neon_vrnd32zq_f64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Intrinsic::aarch64_neon_frint32z;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd32z");
   }
@@ -7160,7 +6973,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vrnd64xq_f32:
   case NEON::BI__builtin_neon_vrnd64x_f64:
   case NEON::BI__builtin_neon_vrnd64xq_f64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Intrinsic::aarch64_neon_frint64x;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64x");
   }
@@ -7168,7 +6980,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vrnd64zq_f32:
   case NEON::BI__builtin_neon_vrnd64z_f64:
   case NEON::BI__builtin_neon_vrnd64zq_f64: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Intrinsic::aarch64_neon_frint64z;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnd64z");
   }
@@ -7291,7 +7102,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
   case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
     // vmulx_lane should be mapped to Neon scalar mulx after
     // extracting the scalar element
-    Ops.push_back(EmitScalarExpr(E->getArg(2)));
     Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
     Ops.pop_back();
     Int = Intrinsic::aarch64_neon_fmulx;
@@ -7322,7 +7132,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm");
   }
   case NEON::BI__builtin_neon_vsqrth_f16: {
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
     Int = Builder.getIsFPConstrained()
               ? Intrinsic::experimental_constrained_sqrt
               : Intrinsic::sqrt;
@@ -7345,8 +7154,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = Intrinsic::aarch64_neon_fmaxv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7354,8 +7162,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned 
BuiltinID,
     Int = Intrinsic::aarch64_neon_fmaxv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7363,8 +7170,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fminv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7372,8 +7178,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fminv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7381,8 +7186,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fmaxnmv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7390,8 +7194,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fmaxnmv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmaxnmv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7399,8 +7202,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fminnmv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
@@ -7408,22 +7210,20 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_fminnmv;
     Ty = HalfTy;
     VTy = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vminnmv");
     return Builder.CreateTrunc(Ops[0], HalfTy);
   }
   case NEON::BI__builtin_neon_vmul_n_f64: {
     Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy);
-    Value *RHS = Builder.CreateBitCast(EmitScalarExpr(E->getArg(1)), DoubleTy);
+    Value *RHS = Builder.CreateBitCast(Ops[1], DoubleTy);
     return Builder.CreateFMul(Ops[0], RHS);
   }
   case NEON::BI__builtin_neon_vaddlv_u8: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
     return Builder.CreateTrunc(Ops[0], Int16Ty);
   }
@@ -7431,16 +7231,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
   }
   case NEON::BI__builtin_neon_vaddlvq_u8: {
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
     return Builder.CreateTrunc(Ops[0], Int16Ty);
   }
@@ -7448,16 +7246,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_uaddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
   }
   case NEON::BI__builtin_neon_vaddlv_s8: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int8Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
     return Builder.CreateTrunc(Ops[0], Int16Ty);
   }
@@ -7465,16 +7261,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int16Ty, 4);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
   }
   case NEON::BI__builtin_neon_vaddlvq_s8: {
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int8Ty, 16);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     Ops[0] = EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
     return Builder.CreateTrunc(Ops[0], Int16Ty);
   }
@@ -7482,8 +7276,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_saddlv;
     Ty = Int32Ty;
     VTy = llvm::FixedVectorType::get(Int16Ty, 8);
-    llvm::Type *Tys[2] = { Ty, VTy };
-    Ops.push_back(EmitScalarExpr(E->getArg(0)));
+    llvm::Type *Tys[2] = {Ty, VTy};
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vaddlv");
   }
   case NEON::BI__builtin_neon_vsri_n_v:
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index 53e8c002a1962..33edc455366a7 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -742,11 +742,13 @@ bool SemaARM::CheckNeonBuiltinFunctionCall(const TargetInfo &TI,
 
   // For NEON intrinsics which are overloaded on vector element type, validate
   // the immediate which specifies which variant to emit.
-  unsigned ImmArg = TheCall->getNumArgs() - 1;
   if (mask) {
+    unsigned ImmArg = TheCall->getNumArgs() - 1;
     if (SemaRef.BuiltinConstantArg(TheCall, ImmArg, Result))
       return true;
 
+    // FIXME: This is effectively dead code. Change the logic above so that the
+    // following check is actually run.
     TV = Result.getLimitedValue(64);
     if ((TV > 63) || (mask & (1ULL << TV)) == 0)
       return Diag(TheCall->getBeginLoc(), diag::err_invalid_neon_type_code)
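
For context on the `mask` check touched by the SemaARM.cpp hunk above: each overloaded NEON builtin carries a 64-bit mask of the type codes it supports, and the trailing discriminator immediate is valid only if its bit is set in that mask. Below is a minimal, self-contained sketch of that mechanism; the helper name and the example mask value are illustrative only and are not part of this patch:

```cpp
#include <cstdint>

// Simplified model of the Sema-side check: each NEON type code (0..63)
// occupies one bit in a 64-bit mask of the variants a given overloaded
// builtin supports; a discriminator value TV is valid only if its bit
// is set in that mask.
static bool isAllowedTypeCode(uint64_t Mask, uint64_t TV) {
  return TV <= 63 && (Mask & (1ULL << TV)) != 0;
}

int main() {
  // Hypothetical mask permitting only type codes 3 and 38 (38 being the
  // discriminator that vsriq_n_p64 passes, per the commit message).
  const uint64_t Mask = (1ULL << 3) | (1ULL << 38);
  return isAllowedTypeCode(Mask, 38) ? 0 : 1; // succeeds for 38
}
```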

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits