https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/154203

From cc9d2d9e4923bad0d904d251fa116ad388b0b6ac Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Mon, 18 Aug 2025 15:59:46 -0500
Subject: [PATCH 1/6] [Clang] Support generic bit counting builtins on fixed
 boolean vectors

Summary:
Boolean vectors as implemented in Clang can be bit-cast to an integer
whose width is rounded up to the next primitive integer size. Users can
do this themselves, but since the bit-counting builtins are very likely
to be used with bitmasks like this, and the generic forms are expected
to be generic, it seems reasonable to handle this case directly.
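
A minimal usage sketch (illustrative only; the typedef and function are
made up for the example and assume a clang build with this patch):

  typedef bool bool8 __attribute__((ext_vector_type(8)));

  int count_set(bool8 mask) {
    // Previously this required manually bit-casting the mask to an
    // unsigned integer; the generic builtin now accepts the vector.
    return __builtin_popcountg(mask);
  }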
---
 clang/docs/LanguageExtensions.rst             |   4 +-
 clang/docs/ReleaseNotes.rst                   |   4 +-
 clang/lib/AST/ByteCode/InterpBuiltin.cpp      |  46 +-
 clang/lib/AST/ExprConstant.cpp                |  42 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  28 +-
 clang/lib/Sema/SemaChecking.cpp               |   4 +-
 clang/test/AST/ByteCode/builtin-functions.cpp |   4 +
 clang/test/CodeGen/builtins.c                 | 423 ++++++++++--------
 8 files changed, 346 insertions(+), 209 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3c6c97bb1fa10..40f8fc9190f94 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4182,7 +4182,7 @@ builtin, the mangler emits their usual pattern without any special treatment.
 -----------------------
 
 ``__builtin_popcountg`` returns the number of 1 bits in the argument. The
-argument can be of any unsigned integer type.
+argument can be of any unsigned integer type or fixed boolean vector.
 
 **Syntax**:
 
@@ -4214,7 +4214,7 @@ such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
 
 ``__builtin_clzg`` (respectively ``__builtin_ctzg``) returns the number of
 leading (respectively trailing) 0 bits in the first argument. The first argument
-can be of any unsigned integer type.
+can be of any unsigned integer type or fixed boolean vector.
 
 If the first argument is 0 and an optional second argument of ``int`` type is
 provided, then the second argument is returned. If the first argument is 0, but
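
As a hedged illustration of the documented behavior (the typedef and
function below are invented for the example; this assumes the patch is
applied):

  typedef bool vb4 __attribute__((ext_vector_type(4)));

  int demo(vb4 m) {
    // The vector's elements act as the bits of a 4-bit mask.
    int set = __builtin_popcountg(m); // number of true elements
    int tz = __builtin_ctzg(m, 4);    // 4 is the fallback if m is all-false
    return set + tz;
  }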
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c32102d102cd3..f0732deec23c3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -143,10 +143,12 @@ Non-comprehensive list of changes in this release
 - Added ``__builtin_masked_load`` and ``__builtin_masked_store`` for conditional
   memory loads from vectors. Binds to the LLVM intrinsic of the same name.
 
+- The ``__builtin_popcountg``, ``__builtin_ctzg``, and ``__builtin_clzg``
+  functions now accept fixed-size boolean vectors.
+
 - Use of ``__has_feature`` to detect the ``ptrauth_qualifier`` and ``ptrauth_intrinsics``
   features has been deprecated, and is restricted to the arm64e target only. The
   correct method to check for these features is to test for the ``__PTRAUTH__``
-  macro.
 
 - Added a new builtin, ``__builtin_dedup_pack``, to remove duplicate types from a parameter pack.
   This feature is particularly useful in template metaprogramming for normalizing type lists.
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 2cbebaf7b630e..5039d9950a4a4 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -141,6 +141,22 @@ static void diagnoseNonConstexprBuiltin(InterpState &S, CodePtr OpPC,
     S.CCEDiag(Loc, diag::note_invalid_subexpr_in_const_expr);
 }
 
+static llvm::APSInt convertBoolVectorToInt(const Pointer &Val) {
+  assert(Val.getFieldDesc()->isPrimitiveArray() &&
+         Val.getFieldDesc()->getElemQualType()->isBooleanType() &&
+         "Not a boolean vector");
+  unsigned NumElts = Val.getNumElems();
+
+  // Each element is one bit, so create an integer with NumElts bits.
+  llvm::APSInt Result(NumElts, 0);
+  for (unsigned I = 0; I < NumElts; ++I) {
+    if (Val.elem<bool>(I))
+      Result.setBit(I);
+  }
+
+  return Result;
+}
+
 static bool interp__builtin_is_constant_evaluated(InterpState &S, CodePtr OpPC,
                                                   const InterpFrame *Frame,
                                                   const CallExpr *Call) {
@@ -638,8 +654,14 @@ static bool interp__builtin_abs(InterpState &S, CodePtr OpPC,
 static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC,
                                      const InterpFrame *Frame,
                                      const CallExpr *Call) {
-  PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
-  APSInt Val = popToAPSInt(S.Stk, ArgT);
+  APSInt Val;
+  if (Call->getArg(0)->getType()->isExtVectorBoolType()) {
+    const Pointer &Arg = S.Stk.pop<Pointer>();
+    Val = convertBoolVectorToInt(Arg);
+  } else {
+    PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType());
+    Val = popToAPSInt(S.Stk, ArgT);
+  }
   pushInteger(S, Val.popcount(), Call->getType());
   return true;
 }
@@ -935,8 +957,14 @@ static bool interp__builtin_clz(InterpState &S, CodePtr OpPC,
     PrimType FallbackT = *S.getContext().classify(Call->getArg(1));
     Fallback = popToAPSInt(S.Stk, FallbackT);
   }
-  PrimType ValT = *S.getContext().classify(Call->getArg(0));
-  const APSInt &Val = popToAPSInt(S.Stk, ValT);
+  APSInt Val;
+  if (Call->getArg(0)->getType()->isExtVectorBoolType()) {
+    const Pointer &Arg = S.Stk.pop<Pointer>();
+    Val = convertBoolVectorToInt(Arg);
+  } else {
+    PrimType ValT = *S.getContext().classify(Call->getArg(0));
+    Val = popToAPSInt(S.Stk, ValT);
+  }
 
   // When the argument is 0, the result of GCC builtins is undefined, whereas
   // for Microsoft intrinsics, the result is the bit-width of the argument.
@@ -966,8 +994,14 @@ static bool interp__builtin_ctz(InterpState &S, CodePtr OpPC,
     PrimType FallbackT = *S.getContext().classify(Call->getArg(1));
     Fallback = popToAPSInt(S.Stk, FallbackT);
   }
-  PrimType ValT = *S.getContext().classify(Call->getArg(0));
-  const APSInt &Val = popToAPSInt(S.Stk, ValT);
+  APSInt Val;
+  if (Call->getArg(0)->getType()->isExtVectorBoolType()) {
+    const Pointer &Arg = S.Stk.pop<Pointer>();
+    Val = convertBoolVectorToInt(Arg);
+  } else {
+    PrimType ValT = *S.getContext().classify(Call->getArg(0));
+    Val = popToAPSInt(S.Stk, ValT);
+  }
 
   if (Val == 0) {
     if (Fallback) {
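
A sketch of what this enables during constant evaluation (element I of
the vector maps to bit I of the temporary integer, as in
convertBoolVectorToInt above; the values below follow from that mapping
and assume the patch is applied):

  constexpr bool __attribute__((ext_vector_type(4))) v = {true, false, true, false};
  static_assert(__builtin_popcountg(v) == 2, ""); // bits 0 and 2 are set
  static_assert(__builtin_ctzg(v, -1) == 0, "");  // element 0 is true
  static_assert(__builtin_clzg(v, -1) == 1, "");  // element 3 false, element 2 true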
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index a03e64fcffde2..e0d1b03dd6d2f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11314,6 +11314,24 @@ static bool EvaluateVector(const Expr* E, APValue& Result, EvalInfo &Info) {
   return VectorExprEvaluator(Info, Result).Visit(E);
 }
 
+static llvm::APInt ConvertBoolVectorToInt(const APValue &Val) {
+  assert(Val.isVector() && "expected vector APValue");
+  unsigned NumElts = Val.getVectorLength();
+
+  // Each element is one bit, so create an integer with NumElts bits.
+  llvm::APInt Result(NumElts, 0);
+
+  for (unsigned I = 0; I < NumElts; ++I) {
+    const APValue &Elt = Val.getVectorElt(I);
+    assert(Elt.isInt() && "expected integer element in bool vector");
+
+    if (Elt.getInt().getBoolValue())
+      Result.setBit(I);
+  }
+
+  return Result;
+}
+
 bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) {
   const VectorType *VTy = E->getType()->castAs<VectorType>();
   unsigned NElts = VTy->getNumElements();
@@ -13456,8 +13474,14 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
   case Builtin::BI__lzcnt:
   case Builtin::BI__lzcnt64: {
     APSInt Val;
-    if (!EvaluateInteger(E->getArg(0), Val, Info))
+    if (E->getArg(0)->getType()->isExtVectorBoolType()) {
+      APValue Vec;
+      if (!EvaluateVector(E->getArg(0), Vec, Info))
+        return false;
+      Val = ConvertBoolVectorToInt(Vec);
+    } else if (!EvaluateInteger(E->getArg(0), Val, Info)) {
       return false;
+    }
 
     std::optional<APSInt> Fallback;
     if ((BuiltinOp == Builtin::BI__builtin_clzg ||
@@ -13542,8 +13566,14 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
   case Builtin::BI__builtin_ctzg:
   case Builtin::BI__builtin_elementwise_cttz: {
     APSInt Val;
-    if (!EvaluateInteger(E->getArg(0), Val, Info))
+    if (E->getArg(0)->getType()->isExtVectorBoolType()) {
+      APValue Vec;
+      if (!EvaluateVector(E->getArg(0), Vec, Info))
+        return false;
+      Val = ConvertBoolVectorToInt(Vec);
+    } else if (!EvaluateInteger(E->getArg(0), Val, Info)) {
       return false;
+    }
 
     std::optional<APSInt> Fallback;
     if ((BuiltinOp == Builtin::BI__builtin_ctzg ||
@@ -13758,8 +13788,14 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
   case Builtin::BI__popcnt:
   case Builtin::BI__popcnt64: {
     APSInt Val;
-    if (!EvaluateInteger(E->getArg(0), Val, Info))
+    if (E->getArg(0)->getType()->isExtVectorBoolType()) {
+      APValue Vec;
+      if (!EvaluateVector(E->getArg(0), Vec, Info))
+        return false;
+      Val = ConvertBoolVectorToInt(Vec);
+    } else if (!EvaluateInteger(E->getArg(0), Val, Info)) {
       return false;
+    }
 
     return Success(Val.popcount(), E);
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d9cc37d123fb4..b98218c36161d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -1693,6 +1693,26 @@ getBitTestAtomicOrdering(BitTest::InterlockingKind I) {
   llvm_unreachable("invalid interlocking");
 }
 
+static llvm::Value *EmitBitCountExpr(CodeGenFunction &CGF, const Expr *E) {
+  llvm::Value *ArgValue = CGF.EmitScalarExpr(E);
+  llvm::Type *ArgType = ArgValue->getType();
+
+  // Boolean vectors can be cast directly to their bit-field representation. We
+  // intentionally do not round up to the next power-of-two size and let LLVM
+  // handle the trailing bits.
+  if (auto *VT = dyn_cast<llvm::FixedVectorType>(ArgType);
+      VT && VT->getElementType()->isIntegerTy(1)) {
+    llvm::Type *StorageType =
+        llvm::Type::getIntNTy(CGF.getLLVMContext(), VT->getNumElements());
+    ArgValue = CGF.emitBoolVecConversion(
+        ArgValue, StorageType->getPrimitiveSizeInBits(), "insertvec");
+    ArgValue = CGF.Builder.CreateBitCast(ArgValue, StorageType);
+    ArgType = ArgValue->getType();
+  }
+
+  return ArgValue;
+}
+
 /// Emit a _bittest* intrinsic. These intrinsics take a pointer to an array of
 /// bits and a bit position and read and optionally modify the bit at that
 /// position. The position index can be arbitrarily large, i.e. it can be larger
@@ -2020,7 +2040,7 @@ Value *CodeGenFunction::EmitCheckedArgForBuiltin(const Expr *E,
   assert((Kind == BCK_CLZPassedZero || Kind == BCK_CTZPassedZero) &&
          "Unsupported builtin check kind");
 
-  Value *ArgValue = EmitScalarExpr(E);
+  Value *ArgValue = EmitBitCountExpr(*this, E);
   if (!SanOpts.has(SanitizerKind::Builtin))
     return ArgValue;
 
@@ -3334,7 +3354,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         E->getNumArgs() > 1;
 
     Value *ArgValue =
-        HasFallback ? EmitScalarExpr(E->getArg(0))
+        HasFallback ? EmitBitCountExpr(*this, E->getArg(0))
                     : EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
 
     llvm::Type *ArgType = ArgValue->getType();
@@ -3371,7 +3391,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
         E->getNumArgs() > 1;
 
     Value *ArgValue =
-        HasFallback ? EmitScalarExpr(E->getArg(0))
+        HasFallback ? EmitBitCountExpr(*this, E->getArg(0))
                     : EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
 
     llvm::Type *ArgType = ArgValue->getType();
@@ -3456,7 +3476,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
   case Builtin::BI__builtin_popcountl:
   case Builtin::BI__builtin_popcountll:
   case Builtin::BI__builtin_popcountg: {
-    Value *ArgValue = EmitScalarExpr(E->getArg(0));
+    Value *ArgValue = EmitBitCountExpr(*this, E->getArg(0));
 
     llvm::Type *ArgType = ArgValue->getType();
     Function *F = CGM.getIntrinsic(Intrinsic::ctpop, ArgType);
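
Semantically, the emitted IR packs the i1 elements into an iN integer
and runs the counting intrinsic on that. A rough C++ model of the
popcount lowering for an 8-element vector (model_popcount is a
hypothetical helper written for this note, not clang API):

  unsigned model_popcount(const bool (&elems)[8]) {
    unsigned bits = 0;
    for (int i = 0; i < 8; ++i)
      bits |= (unsigned)elems[i] << i; // element i becomes bit i of the mask
    return __builtin_popcount(bits);   // stands in for @llvm.ctpop
  }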
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 2944c1a09b32c..e343d77503cc2 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2214,7 +2214,7 @@ static bool BuiltinPopcountg(Sema &S, CallExpr *TheCall) {
 
   QualType ArgTy = Arg->getType();
 
-  if (!ArgTy->isUnsignedIntegerType()) {
+  if (!ArgTy->isUnsignedIntegerType() && !ArgTy->isExtVectorBoolType()) {
     S.Diag(Arg->getBeginLoc(), diag::err_builtin_invalid_arg_type)
         << 1 << /* scalar */ 1 << /* unsigned integer ty */ 3 << /* no fp */ 0
         << ArgTy;
@@ -2239,7 +2239,7 @@ static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
 
   QualType Arg0Ty = Arg0->getType();
 
-  if (!Arg0Ty->isUnsignedIntegerType()) {
+  if (!Arg0Ty->isUnsignedIntegerType() && !Arg0Ty->isExtVectorBoolType()) {
     S.Diag(Arg0->getBeginLoc(), diag::err_builtin_invalid_arg_type)
         << 1 << /* scalar */ 1 << /* unsigned integer ty */ 3 << /* no fp */ 0
         << Arg0Ty;
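
With the check relaxed, calls like the following now type-check, while
non-boolean vectors are still rejected (vb4/vi4 are illustrative
typedefs, not names from the patch):

  typedef bool vb4 __attribute__((ext_vector_type(4)));
  typedef int vi4 __attribute__((ext_vector_type(4)));

  void f(vb4 b, vi4 n) {
    (void)__builtin_popcountg(b); // accepted: fixed boolean vector
    // __builtin_popcountg(n);    // error: not an unsigned integer type
    //                            // or fixed boolean vector
  }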
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 3277ef65a880b..f47bc49d9a1a8 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -454,6 +454,7 @@ namespace SourceLocation {
 }
 
 #define BITSIZE(x) (sizeof(x) * 8)
+constexpr bool __attribute__((ext_vector_type(4))) v4b{};
 namespace popcount {
  static_assert(__builtin_popcount(~0u) == __CHAR_BIT__ * sizeof(unsigned int), "");
   static_assert(__builtin_popcount(0) == 0, "");
@@ -471,6 +472,7 @@ namespace popcount {
   static_assert(__builtin_popcountg(0ul) == 0, "");
  static_assert(__builtin_popcountg(~0ull) == __CHAR_BIT__ * sizeof(unsigned long long), "");
   static_assert(__builtin_popcountg(0ull) == 0, "");
+  static_assert(__builtin_popcountg(v4b) == 0, "");
 #ifdef __SIZEOF_INT128__
  static_assert(__builtin_popcountg(~(unsigned __int128)0) == __CHAR_BIT__ * sizeof(unsigned __int128), "");
   static_assert(__builtin_popcountg((unsigned __int128)0) == 0, "");
@@ -743,6 +745,7 @@ namespace clz {
  char clz62[__builtin_clzg((unsigned _BitInt(128))0xf) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
  char clz63[__builtin_clzg((unsigned _BitInt(128))0xf, 42) == BITSIZE(_BitInt(128)) - 4 ? 1 : -1];
 #endif
+  char clz64[__builtin_clzg(v4b, 0) == 0 ? 1 : -1];
 }
 
 namespace ctz {
@@ -813,6 +816,7 @@ namespace ctz {
  char ctz62[__builtin_ctzg((unsigned _BitInt(128))1 << (BITSIZE(_BitInt(128)) - 1)) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
  char ctz63[__builtin_ctzg((unsigned _BitInt(128))1 << (BITSIZE(_BitInt(128)) - 1), 42) == BITSIZE(_BitInt(128)) - 1 ? 1 : -1];
 #endif
+  char ctz64[__builtin_ctzg(v4b, 0) == 0 ? 1 : -1];
 }
 
 namespace bswap {
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index aa9965b815983..7ad143ed165c8 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -991,247 +991,288 @@ void test_builtin_os_log_long_double(void *buf, long double ld) {
 void test_builtin_popcountg(unsigned char uc, unsigned short us,
                             unsigned int ui, unsigned long ul,
                             unsigned long long ull, unsigned __int128 ui128,
-                            unsigned _BitInt(128) ubi128) {
+                            unsigned _BitInt(128) ubi128,
+                            _Bool __attribute__((ext_vector_type(8))) vb8) {
   volatile int pop;
-  pop = __builtin_popcountg(uc);
-  // CHECK: %1 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %2 = call i8 @llvm.ctpop.i8(i8 %1)
-  // CHECK-NEXT: %cast = zext i8 %2 to i32
+  //      CHECK: %2 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT: %3 = call i8 @llvm.ctpop.i8(i8 %2)
+  // CHECK-NEXT: %cast = zext i8 %3 to i32
   // CHECK-NEXT: store volatile i32 %cast, ptr %pop, align 4
+  pop = __builtin_popcountg(uc);
+  //      CHECK: %4 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT: %5 = call i16 @llvm.ctpop.i16(i16 %4)
+  // CHECK-NEXT: %cast2 = zext i16 %5 to i32
+  // CHECK-NEXT: store volatile i32 %cast2, ptr %pop, align 4
   pop = __builtin_popcountg(us);
-  // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %4 = call i16 @llvm.ctpop.i16(i16 %3)
-  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
-  // CHECK-NEXT: store volatile i32 %cast1, ptr %pop, align 4
+  //      CHECK: %6 = load i32, ptr %ui.addr, align 4
+  // CHECK-NEXT: %7 = call i32 @llvm.ctpop.i32(i32 %6)
+  // CHECK-NEXT: store volatile i32 %7, ptr %pop, align 4
   pop = __builtin_popcountg(ui);
-  // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
-  // CHECK-NEXT: %6 = call i32 @llvm.ctpop.i32(i32 %5)
-  // CHECK-NEXT: store volatile i32 %6, ptr %pop, align 4
+  //      CHECK: %8 = load i64, ptr %ul.addr, align 8
+  // CHECK-NEXT: %9 = call i64 @llvm.ctpop.i64(i64 %8)
+  // CHECK-NEXT: %cast3 = trunc i64 %9 to i32
+  // CHECK-NEXT: store volatile i32 %cast3, ptr %pop, align 4
   pop = __builtin_popcountg(ul);
-  // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
-  // CHECK-NEXT: %8 = call i64 @llvm.ctpop.i64(i64 %7)
-  // CHECK-NEXT: %cast2 = trunc i64 %8 to i32
-  // CHECK-NEXT: store volatile i32 %cast2, ptr %pop, align 4
+  //      CHECK: %10 = load i64, ptr %ull.addr, align 8
+  // CHECK-NEXT: %11 = call i64 @llvm.ctpop.i64(i64 %10)
+  // CHECK-NEXT: %cast4 = trunc i64 %11 to i32
+  // CHECK-NEXT: store volatile i32 %cast4, ptr %pop, align 4
   pop = __builtin_popcountg(ull);
-  // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
-  // CHECK-NEXT: %10 = call i64 @llvm.ctpop.i64(i64 %9)
-  // CHECK-NEXT: %cast3 = trunc i64 %10 to i32
-  // CHECK-NEXT: store volatile i32 %cast3, ptr %pop, align 4
+  //      CHECK: %12 = load i128, ptr %ui128.addr, align 16
+  // CHECK-NEXT: %13 = call i128 @llvm.ctpop.i128(i128 %12)
+  // CHECK-NEXT: %cast5 = trunc i128 %13 to i32
+  // CHECK-NEXT: store volatile i32 %cast5, ptr %pop, align 4
   pop = __builtin_popcountg(ui128);
-  // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
-  // CHECK-NEXT: %12 = call i128 @llvm.ctpop.i128(i128 %11)
-  // CHECK-NEXT: %cast4 = trunc i128 %12 to i32
-  // CHECK-NEXT: store volatile i32 %cast4, ptr %pop, align 4
+  //      CHECK: %14 = load i128, ptr %ubi128.addr, align 8
+  // CHECK-NEXT: %15 = call i128 @llvm.ctpop.i128(i128 %14)
+  // CHECK-NEXT: %cast6 = trunc i128 %15 to i32
+  // CHECK-NEXT: store volatile i32 %cast6, ptr %pop, align 4
   pop = __builtin_popcountg(ubi128);
-  // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
-  // CHECK-NEXT: %14 = call i128 @llvm.ctpop.i128(i128 %13)
-  // CHECK-NEXT: %cast5 = trunc i128 %14 to i32
-  // CHECK-NEXT: store volatile i32 %cast5, ptr %pop, align 4
-  // CHECK-NEXT: ret void
+  //      CHECK: %load_bits7 = load i8, ptr %vb8.addr, align 1
+  // CHECK-NEXT: %16 = bitcast i8 %load_bits7 to <8 x i1>
+  // CHECK-NEXT: %17 = bitcast <8 x i1> %16 to i8
+  // CHECK-NEXT: %18 = call i8 @llvm.ctpop.i8(i8 %17)
+  // CHECK-NEXT: %cast8 = zext i8 %18 to i32
+  // CHECK-NEXT: store volatile i32 %cast8, ptr %pop, align 4
+  pop = __builtin_popcountg(vb8);
 }
 
 // CHECK-LABEL: define{{.*}} void @test_builtin_clzg
 void test_builtin_clzg(unsigned char uc, unsigned short us, unsigned int ui,
                        unsigned long ul, unsigned long long ull,
                        unsigned __int128 ui128, unsigned _BitInt(128) ubi128,
-                       signed char sc, short s, int i) {
+                       signed char sc, short s, int i,
+                       _Bool __attribute__((ext_vector_type(8))) vb8) {
   volatile int lz;
+  //      CHECK:  %2 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT:  %3 = call i8 @llvm.ctlz.i8(i8 %2, i1 true)
+  // CHECK-NEXT:  %cast = zext i8 %3 to i32
+  // CHECK-NEXT:  store volatile i32 %cast, ptr %lz, align 4
   lz = __builtin_clzg(uc);
-  // CHECK: %1 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %2 = call i8 @llvm.ctlz.i8(i8 %1, i1 true)
-  // CHECK-NEXT: %cast = zext i8 %2 to i32
-  // CHECK-NEXT: store volatile i32 %cast, ptr %lz, align 4
+  // CHECK-NEXT:  %4 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT:  %5 = call i16 @llvm.ctlz.i16(i16 %4, i1 true)
+  // CHECK-NEXT:  %cast2 = zext i16 %5 to i32
+  // CHECK-NEXT:  store volatile i32 %cast2, ptr %lz, align 4
   lz = __builtin_clzg(us);
-  // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %4 = call i16 @llvm.ctlz.i16(i16 %3, i1 true)
-  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
-  // CHECK-NEXT: store volatile i32 %cast1, ptr %lz, align 4
+  // CHECK-NEXT:  %6 = load i32, ptr %ui.addr, align 4
+  // CHECK-NEXT:  %7 = call i32 @llvm.ctlz.i32(i32 %6, i1 true)
+  // CHECK-NEXT:  store volatile i32 %7, ptr %lz, align 4
   lz = __builtin_clzg(ui);
-  // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
-  // CHECK-NEXT: %6 = call i32 @llvm.ctlz.i32(i32 %5, i1 true)
-  // CHECK-NEXT: store volatile i32 %6, ptr %lz, align 4
+  // CHECK-NEXT:  %8 = load i64, ptr %ul.addr, align 8
+  // CHECK-NEXT:  %9 = call i64 @llvm.ctlz.i64(i64 %8, i1 true)
+  // CHECK-NEXT:  %cast3 = trunc i64 %9 to i32
+  // CHECK-NEXT:  store volatile i32 %cast3, ptr %lz, align 4
   lz = __builtin_clzg(ul);
-  // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
-  // CHECK-NEXT: %8 = call i64 @llvm.ctlz.i64(i64 %7, i1 true)
-  // CHECK-NEXT: %cast2 = trunc i64 %8 to i32
-  // CHECK-NEXT: store volatile i32 %cast2, ptr %lz, align 4
+  // CHECK-NEXT:  %10 = load i64, ptr %ull.addr, align 8
+  // CHECK-NEXT:  %11 = call i64 @llvm.ctlz.i64(i64 %10, i1 true)
+  // CHECK-NEXT:  %cast4 = trunc i64 %11 to i32
+  // CHECK-NEXT:  store volatile i32 %cast4, ptr %lz, align 4
   lz = __builtin_clzg(ull);
-  // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
-  // CHECK-NEXT: %10 = call i64 @llvm.ctlz.i64(i64 %9, i1 true)
-  // CHECK-NEXT: %cast3 = trunc i64 %10 to i32
-  // CHECK-NEXT: store volatile i32 %cast3, ptr %lz, align 4
+  // CHECK-NEXT:  %12 = load i128, ptr %ui128.addr, align 16
+  // CHECK-NEXT:  %13 = call i128 @llvm.ctlz.i128(i128 %12, i1 true)
+  // CHECK-NEXT:  %cast5 = trunc i128 %13 to i32
+  // CHECK-NEXT:  store volatile i32 %cast5, ptr %lz, align 4
   lz = __builtin_clzg(ui128);
-  // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
-  // CHECK-NEXT: %12 = call i128 @llvm.ctlz.i128(i128 %11, i1 true)
-  // CHECK-NEXT: %cast4 = trunc i128 %12 to i32
-  // CHECK-NEXT: store volatile i32 %cast4, ptr %lz, align 4
+  // CHECK-NEXT:  %14 = load i128, ptr %ubi128.addr, align 8
+  // CHECK-NEXT:  %15 = call i128 @llvm.ctlz.i128(i128 %14, i1 true)
+  // CHECK-NEXT:  %cast6 = trunc i128 %15 to i32
+  // CHECK-NEXT:  store volatile i32 %cast6, ptr %lz, align 4
   lz = __builtin_clzg(ubi128);
-  // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
-  // CHECK-NEXT: %14 = call i128 @llvm.ctlz.i128(i128 %13, i1 true)
-  // CHECK-NEXT: %cast5 = trunc i128 %14 to i32
-  // CHECK-NEXT: store volatile i32 %cast5, ptr %lz, align 4
+  // CHECK-NEXT:  %load_bits7 = load i8, ptr %vb8.addr, align 1
+  // CHECK-NEXT:  %16 = bitcast i8 %load_bits7 to <8 x i1>
+  // CHECK-NEXT:  %17 = bitcast <8 x i1> %16 to i8
+  // CHECK-NEXT:  %18 = call i8 @llvm.ctlz.i8(i8 %17, i1 true)
+  // CHECK-NEXT:  %cast8 = zext i8 %18 to i32
+  // CHECK-NEXT:  store volatile i32 %cast8, ptr %lz, align 4
+  lz = __builtin_clzg(vb8);
+  // CHECK-NEXT:  %19 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT:  %20 = call i8 @llvm.ctlz.i8(i8 %19, i1 true)
+  // CHECK-NEXT:  %cast9 = zext i8 %20 to i32
+  // CHECK-NEXT:  %iszero = icmp eq i8 %19, 0
+  // CHECK-NEXT:  %21 = load i8, ptr %sc.addr, align 1
+  // CHECK-NEXT:  %conv = sext i8 %21 to i32
+  // CHECK-NEXT:  %clzg = select i1 %iszero, i32 %conv, i32 %cast9
+  // CHECK-NEXT:  store volatile i32 %clzg, ptr %lz, align 4
   lz = __builtin_clzg(uc, sc);
-  // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %16 = call i8 @llvm.ctlz.i8(i8 %15, i1 true)
-  // CHECK-NEXT: %cast6 = zext i8 %16 to i32
-  // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
-  // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
-  // CHECK-NEXT: %conv = sext i8 %17 to i32
-  // CHECK-NEXT: %clzg = select i1 %iszero, i32 %conv, i32 %cast6
-  // CHECK-NEXT: store volatile i32 %clzg, ptr %lz, align 4
+  // CHECK-NEXT:  %22 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT:  %23 = call i16 @llvm.ctlz.i16(i16 %22, i1 true)
+  // CHECK-NEXT:  %cast10 = zext i16 %23 to i32
+  // CHECK-NEXT:  %iszero11 = icmp eq i16 %22, 0
+  // CHECK-NEXT:  %24 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT:  %conv12 = zext i8 %24 to i32
+  // CHECK-NEXT:  %clzg13 = select i1 %iszero11, i32 %conv12, i32 %cast10
+  // CHECK-NEXT:  store volatile i32 %clzg13, ptr %lz, align 4
   lz = __builtin_clzg(us, uc);
-  // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %19 = call i16 @llvm.ctlz.i16(i16 %18, i1 true)
-  // CHECK-NEXT: %cast7 = zext i16 %19 to i32
-  // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
-  // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %conv9 = zext i8 %20 to i32
-  // CHECK-NEXT: %clzg10 = select i1 %iszero8, i32 %conv9, i32 %cast7
-  // CHECK-NEXT: store volatile i32 %clzg10, ptr %lz, align 4
+  // CHECK-NEXT:  %25 = load i32, ptr %ui.addr, align 4
+  // CHECK-NEXT:  %26 = call i32 @llvm.ctlz.i32(i32 %25, i1 true)
+  // CHECK-NEXT:  %iszero14 = icmp eq i32 %25, 0
+  // CHECK-NEXT:  %27 = load i16, ptr %s.addr, align 2
+  // CHECK-NEXT:  %conv15 = sext i16 %27 to i32
+  // CHECK-NEXT:  %clzg16 = select i1 %iszero14, i32 %conv15, i32 %26
+  // CHECK-NEXT:  store volatile i32 %clzg16, ptr %lz, align 4
   lz = __builtin_clzg(ui, s);
-  // CHECK-NEXT: %21 = load i32, ptr %ui.addr, align 4
-  // CHECK-NEXT: %22 = call i32 @llvm.ctlz.i32(i32 %21, i1 true)
-  // CHECK-NEXT: %iszero11 = icmp eq i32 %21, 0
-  // CHECK-NEXT: %23 = load i16, ptr %s.addr, align 2
-  // CHECK-NEXT: %conv12 = sext i16 %23 to i32
-  // CHECK-NEXT: %clzg13 = select i1 %iszero11, i32 %conv12, i32 %22
-  // CHECK-NEXT: store volatile i32 %clzg13, ptr %lz, align 4
+  // CHECK-NEXT:  %28 = load i64, ptr %ul.addr, align 8
+  // CHECK-NEXT:  %29 = call i64 @llvm.ctlz.i64(i64 %28, i1 true)
+  // CHECK-NEXT:  %cast17 = trunc i64 %29 to i32
+  // CHECK-NEXT:  %iszero18 = icmp eq i64 %28, 0
+  // CHECK-NEXT:  %30 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT:  %conv19 = zext i16 %30 to i32
+  // CHECK-NEXT:  %clzg20 = select i1 %iszero18, i32 %conv19, i32 %cast17
+  // CHECK-NEXT:  store volatile i32 %clzg20, ptr %lz, align 4
   lz = __builtin_clzg(ul, us);
-  // CHECK-NEXT: %24 = load i64, ptr %ul.addr, align 8
-  // CHECK-NEXT: %25 = call i64 @llvm.ctlz.i64(i64 %24, i1 true)
-  // CHECK-NEXT: %cast14 = trunc i64 %25 to i32
-  // CHECK-NEXT: %iszero15 = icmp eq i64 %24, 0
-  // CHECK-NEXT: %26 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %conv16 = zext i16 %26 to i32
-  // CHECK-NEXT: %clzg17 = select i1 %iszero15, i32 %conv16, i32 %cast14
-  // CHECK-NEXT: store volatile i32 %clzg17, ptr %lz, align 4
+  // CHECK-NEXT:  %31 = load i64, ptr %ull.addr, align 8
+  // CHECK-NEXT:  %32 = call i64 @llvm.ctlz.i64(i64 %31, i1 true)
+  // CHECK-NEXT:  %cast21 = trunc i64 %32 to i32
+  // CHECK-NEXT:  %iszero22 = icmp eq i64 %31, 0
+  // CHECK-NEXT:  %33 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT:  %clzg23 = select i1 %iszero22, i32 %33, i32 %cast21
+  // CHECK-NEXT:  store volatile i32 %clzg23, ptr %lz, align 4
   lz = __builtin_clzg(ull, i);
-  // CHECK-NEXT: %27 = load i64, ptr %ull.addr, align 8
-  // CHECK-NEXT: %28 = call i64 @llvm.ctlz.i64(i64 %27, i1 true)
-  // CHECK-NEXT: %cast18 = trunc i64 %28 to i32
-  // CHECK-NEXT: %iszero19 = icmp eq i64 %27, 0
-  // CHECK-NEXT: %29 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %clzg20 = select i1 %iszero19, i32 %29, i32 %cast18
-  // CHECK-NEXT: store volatile i32 %clzg20, ptr %lz, align 4
+  // CHECK-NEXT:  %34 = load i128, ptr %ui128.addr, align 16
+  // CHECK-NEXT:  %35 = call i128 @llvm.ctlz.i128(i128 %34, i1 true)
+  // CHECK-NEXT:  %cast24 = trunc i128 %35 to i32
+  // CHECK-NEXT:  %iszero25 = icmp eq i128 %34, 0
+  // CHECK-NEXT:  %36 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT:  %clzg26 = select i1 %iszero25, i32 %36, i32 %cast24
+  // CHECK-NEXT:  store volatile i32 %clzg26, ptr %lz, align 4
   lz = __builtin_clzg(ui128, i);
-  // CHECK-NEXT: %30 = load i128, ptr %ui128.addr, align 16
-  // CHECK-NEXT: %31 = call i128 @llvm.ctlz.i128(i128 %30, i1 true)
-  // CHECK-NEXT: %cast21 = trunc i128 %31 to i32
-  // CHECK-NEXT: %iszero22 = icmp eq i128 %30, 0
-  // CHECK-NEXT: %32 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %clzg23 = select i1 %iszero22, i32 %32, i32 %cast21
-  // CHECK-NEXT: store volatile i32 %clzg23, ptr %lz, align 4
+  // CHECK-NEXT:  %37 = load i128, ptr %ubi128.addr, align 8
+  // CHECK-NEXT:  %38 = call i128 @llvm.ctlz.i128(i128 %37, i1 true)
+  // CHECK-NEXT:  %cast27 = trunc i128 %38 to i32
+  // CHECK-NEXT:  %iszero28 = icmp eq i128 %37, 0
+  // CHECK-NEXT:  %39 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT:  %clzg29 = select i1 %iszero28, i32 %39, i32 %cast27
+  // CHECK-NEXT:  store volatile i32 %clzg29, ptr %lz, align 4
   lz = __builtin_clzg(ubi128, i);
-   // CHECK-NEXT: %33 = load i128, ptr %ubi128.addr, align 8
-  // CHECK-NEXT: %34 = call i128 @llvm.ctlz.i128(i128 %33, i1 true)
-  // CHECK-NEXT: %cast24 = trunc i128 %34 to i32
-  // CHECK-NEXT: %iszero25 = icmp eq i128 %33, 0
-  // CHECK-NEXT: %35 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %clzg26 = select i1 %iszero25, i32 %35, i32 %cast24
-  // CHECK-NEXT: store volatile i32 %clzg26, ptr %lz, align 4
-  // CHECK-NEXT: ret void
+  // CHECK-NEXT:  %load_bits30 = load i8, ptr %vb8.addr, align 1
+  // CHECK-NEXT:  %40 = bitcast i8 %load_bits30 to <8 x i1>
+  // CHECK-NEXT:  %41 = bitcast <8 x i1> %40 to i8
+  // CHECK-NEXT:  %42 = call i8 @llvm.ctlz.i8(i8 %41, i1 true)
+  // CHECK-NEXT:  %cast31 = zext i8 %42 to i32
+  // CHECK-NEXT:  %iszero32 = icmp eq i8 %41, 0
+  // CHECK-NEXT:  %43 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT:  %clzg33 = select i1 %iszero32, i32 %43, i32 %cast31
+  // CHECK-NEXT:  store volatile i32 %clzg33, ptr %lz, align 4
+  lz = __builtin_clzg(vb8, i);
 }
 
 // CHECK-LABEL: define{{.*}} void @test_builtin_ctzg
 void test_builtin_ctzg(unsigned char uc, unsigned short us, unsigned int ui,
                        unsigned long ul, unsigned long long ull,
                        unsigned __int128 ui128, unsigned _BitInt(128) ubi128,
-                       signed char sc, short s, int i) {
+                       signed char sc, short s, int i,
+                       _Bool __attribute__((ext_vector_type(8))) vb8) {
   volatile int tz;
-  tz = __builtin_ctzg(uc);
-  // CHECK: %1 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %2 = call i8 @llvm.cttz.i8(i8 %1, i1 true)
-  // CHECK-NEXT: %cast = zext i8 %2 to i32
+  //      CHECK: %2 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT: %3 = call i8 @llvm.cttz.i8(i8 %2, i1 true)
+  // CHECK-NEXT: %cast = zext i8 %3 to i32
   // CHECK-NEXT: store volatile i32 %cast, ptr %tz, align 4
+  tz = __builtin_ctzg(uc);
+  // CHECK-NEXT: %4 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT: %5 = call i16 @llvm.cttz.i16(i16 %4, i1 true)
+  // CHECK-NEXT: %cast2 = zext i16 %5 to i32
+  // CHECK-NEXT: store volatile i32 %cast2, ptr %tz, align 4
   tz = __builtin_ctzg(us);
-  // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %4 = call i16 @llvm.cttz.i16(i16 %3, i1 true)
-  // CHECK-NEXT: %cast1 = zext i16 %4 to i32
-  // CHECK-NEXT: store volatile i32 %cast1, ptr %tz, align 4
+  // CHECK-NEXT: %6 = load i32, ptr %ui.addr, align 4
+  // CHECK-NEXT: %7 = call i32 @llvm.cttz.i32(i32 %6, i1 true)
+  // CHECK-NEXT: store volatile i32 %7, ptr %tz, align 4
   tz = __builtin_ctzg(ui);
-  // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
-  // CHECK-NEXT: %6 = call i32 @llvm.cttz.i32(i32 %5, i1 true)
-  // CHECK-NEXT: store volatile i32 %6, ptr %tz, align 4
+  // CHECK-NEXT: %8 = load i64, ptr %ul.addr, align 8
+  // CHECK-NEXT: %9 = call i64 @llvm.cttz.i64(i64 %8, i1 true)
+  // CHECK-NEXT: %cast3 = trunc i64 %9 to i32
+  // CHECK-NEXT: store volatile i32 %cast3, ptr %tz, align 4
   tz = __builtin_ctzg(ul);
-  // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
-  // CHECK-NEXT: %8 = call i64 @llvm.cttz.i64(i64 %7, i1 true)
-  // CHECK-NEXT: %cast2 = trunc i64 %8 to i32
-  // CHECK-NEXT: store volatile i32 %cast2, ptr %tz, align 4
+  // CHECK-NEXT: %10 = load i64, ptr %ull.addr, align 8
+  // CHECK-NEXT: %11 = call i64 @llvm.cttz.i64(i64 %10, i1 true)
+  // CHECK-NEXT: %cast4 = trunc i64 %11 to i32
+  // CHECK-NEXT: store volatile i32 %cast4, ptr %tz, align 4
   tz = __builtin_ctzg(ull);
-  // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
-  // CHECK-NEXT: %10 = call i64 @llvm.cttz.i64(i64 %9, i1 true)
-  // CHECK-NEXT: %cast3 = trunc i64 %10 to i32
-  // CHECK-NEXT: store volatile i32 %cast3, ptr %tz, align 4
+  // CHECK-NEXT: %12 = load i128, ptr %ui128.addr, align 16
+  // CHECK-NEXT: %13 = call i128 @llvm.cttz.i128(i128 %12, i1 true)
+  // CHECK-NEXT: %cast5 = trunc i128 %13 to i32
+  // CHECK-NEXT: store volatile i32 %cast5, ptr %tz, align 4
   tz = __builtin_ctzg(ui128);
-  // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
-  // CHECK-NEXT: %12 = call i128 @llvm.cttz.i128(i128 %11, i1 true)
-  // CHECK-NEXT: %cast4 = trunc i128 %12 to i32
-  // CHECK-NEXT: store volatile i32 %cast4, ptr %tz, align 4
+  // CHECK-NEXT: %14 = load i128, ptr %ubi128.addr, align 8
+  // CHECK-NEXT: %15 = call i128 @llvm.cttz.i128(i128 %14, i1 true)
+  // CHECK-NEXT: %cast6 = trunc i128 %15 to i32
+  // CHECK-NEXT: store volatile i32 %cast6, ptr %tz, align 4
   tz = __builtin_ctzg(ubi128);
-  // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
-  // CHECK-NEXT: %14 = call i128 @llvm.cttz.i128(i128 %13, i1 true)
-  // CHECK-NEXT: %cast5 = trunc i128 %14 to i32
-  // CHECK-NEXT: store volatile i32 %cast5, ptr %tz, align 4
-  tz = __builtin_ctzg(uc, sc);
-  // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %16 = call i8 @llvm.cttz.i8(i8 %15, i1 true)
-  // CHECK-NEXT: %cast6 = zext i8 %16 to i32
-  // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
-  // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
-  // CHECK-NEXT: %conv = sext i8 %17 to i32
-  // CHECK-NEXT: %ctzg = select i1 %iszero, i32 %conv, i32 %cast6
+  // CHECK-NEXT: %load_bits7 = load i8, ptr %vb8.addr, align 1
+  // CHECK-NEXT: %16 = bitcast i8 %load_bits7 to <8 x i1>
+  // CHECK-NEXT: %17 = bitcast <8 x i1> %16 to i8
+  // CHECK-NEXT: %18 = call i8 @llvm.cttz.i8(i8 %17, i1 true)
+  // CHECK-NEXT: %cast8 = zext i8 %18 to i32
+  // CHECK-NEXT: store volatile i32 %cast8, ptr %tz, align 4
+  tz = __builtin_ctzg(vb8);
+  // CHECK-NEXT: %19 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT: %20 = call i8 @llvm.cttz.i8(i8 %19, i1 true)
+  // CHECK-NEXT: %cast9 = zext i8 %20 to i32
+  // CHECK-NEXT: %iszero = icmp eq i8 %19, 0
+  // CHECK-NEXT: %21 = load i8, ptr %sc.addr, align 1
+  // CHECK-NEXT: %conv = sext i8 %21 to i32
+  // CHECK-NEXT: %ctzg = select i1 %iszero, i32 %conv, i32 %cast9
   // CHECK-NEXT: store volatile i32 %ctzg, ptr %tz, align 4
+  tz = __builtin_ctzg(uc, sc);
+  // CHECK-NEXT: %22 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT: %23 = call i16 @llvm.cttz.i16(i16 %22, i1 true)
+  // CHECK-NEXT: %cast10 = zext i16 %23 to i32
+  // CHECK-NEXT: %iszero11 = icmp eq i16 %22, 0
+  // CHECK-NEXT: %24 = load i8, ptr %uc.addr, align 1
+  // CHECK-NEXT: %conv12 = zext i8 %24 to i32
+  // CHECK-NEXT: %ctzg13 = select i1 %iszero11, i32 %conv12, i32 %cast10
+  // CHECK-NEXT: store volatile i32 %ctzg13, ptr %tz, align 4
   tz = __builtin_ctzg(us, uc);
-  // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %19 = call i16 @llvm.cttz.i16(i16 %18, i1 true)
-  // CHECK-NEXT: %cast7 = zext i16 %19 to i32
-  // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
-  // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
-  // CHECK-NEXT: %conv9 = zext i8 %20 to i32
-  // CHECK-NEXT: %ctzg10 = select i1 %iszero8, i32 %conv9, i32 %cast7
-  // CHECK-NEXT: store volatile i32 %ctzg10, ptr %tz, align 4
+  // CHECK-NEXT: %25 = load i32, ptr %ui.addr, align 4
+  // CHECK-NEXT: %26 = call i32 @llvm.cttz.i32(i32 %25, i1 true)
+  // CHECK-NEXT: %iszero14 = icmp eq i32 %25, 0
+  // CHECK-NEXT: %27 = load i16, ptr %s.addr, align 2
+  // CHECK-NEXT: %conv15 = sext i16 %27 to i32
+  // CHECK-NEXT: %ctzg16 = select i1 %iszero14, i32 %conv15, i32 %26
+  // CHECK-NEXT: store volatile i32 %ctzg16, ptr %tz, align 4
   tz = __builtin_ctzg(ui, s);
-  // CHECK-NEXT: %21 = load i32, ptr %ui.addr, align 4
-  // CHECK-NEXT: %22 = call i32 @llvm.cttz.i32(i32 %21, i1 true)
-  // CHECK-NEXT: %iszero11 = icmp eq i32 %21, 0
-  // CHECK-NEXT: %23 = load i16, ptr %s.addr, align 2
-  // CHECK-NEXT: %conv12 = sext i16 %23 to i32
-  // CHECK-NEXT: %ctzg13 = select i1 %iszero11, i32 %conv12, i32 %22
-  // CHECK-NEXT: store volatile i32 %ctzg13, ptr %tz, align 4
+  // CHECK-NEXT: %28 = load i64, ptr %ul.addr, align 8
+  // CHECK-NEXT: %29 = call i64 @llvm.cttz.i64(i64 %28, i1 true)
+  // CHECK-NEXT: %cast17 = trunc i64 %29 to i32
+  // CHECK-NEXT: %iszero18 = icmp eq i64 %28, 0
+  // CHECK-NEXT: %30 = load i16, ptr %us.addr, align 2
+  // CHECK-NEXT: %conv19 = zext i16 %30 to i32
+  // CHECK-NEXT: %ctzg20 = select i1 %iszero18, i32 %conv19, i32 %cast17
+  // CHECK-NEXT: store volatile i32 %ctzg20, ptr %tz, align 4
   tz = __builtin_ctzg(ul, us);
-  // CHECK-NEXT: %24 = load i64, ptr %ul.addr, align 8
-  // CHECK-NEXT: %25 = call i64 @llvm.cttz.i64(i64 %24, i1 true)
-  // CHECK-NEXT: %cast14 = trunc i64 %25 to i32
-  // CHECK-NEXT: %iszero15 = icmp eq i64 %24, 0
-  // CHECK-NEXT: %26 = load i16, ptr %us.addr, align 2
-  // CHECK-NEXT: %conv16 = zext i16 %26 to i32
-  // CHECK-NEXT: %ctzg17 = select i1 %iszero15, i32 %conv16, i32 %cast14
-  // CHECK-NEXT: store volatile i32 %ctzg17, ptr %tz, align 4
+  // CHECK-NEXT: %31 = load i64, ptr %ull.addr, align 8
+  // CHECK-NEXT: %32 = call i64 @llvm.cttz.i64(i64 %31, i1 true)
+  // CHECK-NEXT: %cast21 = trunc i64 %32 to i32
+  // CHECK-NEXT: %iszero22 = icmp eq i64 %31, 0
+  // CHECK-NEXT: %33 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT: %ctzg23 = select i1 %iszero22, i32 %33, i32 %cast21
+  // CHECK-NEXT: store volatile i32 %ctzg23, ptr %tz, align 4
   tz = __builtin_ctzg(ull, i);
-  // CHECK-NEXT: %27 = load i64, ptr %ull.addr, align 8
-  // CHECK-NEXT: %28 = call i64 @llvm.cttz.i64(i64 %27, i1 true)
-  // CHECK-NEXT: %cast18 = trunc i64 %28 to i32
-  // CHECK-NEXT: %iszero19 = icmp eq i64 %27, 0
-  // CHECK-NEXT: %29 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %ctzg20 = select i1 %iszero19, i32 %29, i32 %cast18
-  // CHECK-NEXT: store volatile i32 %ctzg20, ptr %tz, align 4
+  // CHECK-NEXT: %34 = load i128, ptr %ui128.addr, align 16
+  // CHECK-NEXT: %35 = call i128 @llvm.cttz.i128(i128 %34, i1 true)
+  // CHECK-NEXT: %cast24 = trunc i128 %35 to i32
+  // CHECK-NEXT: %iszero25 = icmp eq i128 %34, 0
+  // CHECK-NEXT: %36 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT: %ctzg26 = select i1 %iszero25, i32 %36, i32 %cast24
+  // CHECK-NEXT: store volatile i32 %ctzg26, ptr %tz, align 4
   tz = __builtin_ctzg(ui128, i);
-  // CHECK-NEXT: %30 = load i128, ptr %ui128.addr, align 16
-  // CHECK-NEXT: %31 = call i128 @llvm.cttz.i128(i128 %30, i1 true)
-  // CHECK-NEXT: %cast21 = trunc i128 %31 to i32
-  // CHECK-NEXT: %iszero22 = icmp eq i128 %30, 0
-  // CHECK-NEXT: %32 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %ctzg23 = select i1 %iszero22, i32 %32, i32 %cast21
-  // CHECK-NEXT: store volatile i32 %ctzg23, ptr %tz, align 4
+  // CHECK-NEXT: %37 = load i128, ptr %ubi128.addr, align 8
+  // CHECK-NEXT: %38 = call i128 @llvm.cttz.i128(i128 %37, i1 true)
+  // CHECK-NEXT: %cast27 = trunc i128 %38 to i32
+  // CHECK-NEXT: %iszero28 = icmp eq i128 %37, 0
+  // CHECK-NEXT: %39 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT: %ctzg29 = select i1 %iszero28, i32 %39, i32 %cast27
+  // CHECK-NEXT: store volatile i32 %ctzg29, ptr %tz, align 4
   tz = __builtin_ctzg(ubi128, i);
-  // CHECK-NEXT: %33 = load i128, ptr %ubi128.addr, align 8
-  // CHECK-NEXT: %34 = call i128 @llvm.cttz.i128(i128 %33, i1 true)
-  // CHECK-NEXT: %cast24 = trunc i128 %34 to i32
-  // CHECK-NEXT: %iszero25 = icmp eq i128 %33, 0
-  // CHECK-NEXT: %35 = load i32, ptr %i.addr, align 4
-  // CHECK-NEXT: %ctzg26 = select i1 %iszero25, i32 %35, i32 %cast24
-  // CHECK-NEXT: store volatile i32 %ctzg26, ptr %tz, align 4
-  // CHECK-NEXT: ret void
+  // CHECK-NEXT: %load_bits30 = load i8, ptr %vb8.addr, align 1
+  // CHECK-NEXT: %40 = bitcast i8 %load_bits30 to <8 x i1>
+  // CHECK-NEXT: %41 = bitcast <8 x i1> %40 to i8
+  // CHECK-NEXT: %42 = call i8 @llvm.cttz.i8(i8 %41, i1 true)
+  // CHECK-NEXT: %cast31 = zext i8 %42 to i32
+  // CHECK-NEXT: %iszero32 = icmp eq i8 %41, 0
+  // CHECK-NEXT: %43 = load i32, ptr %i.addr, align 4
+  // CHECK-NEXT: %ctzg33 = select i1 %iszero32, i32 %43, i32 %cast31
+  // CHECK-NEXT: store volatile i32 %ctzg33, ptr %tz, align 4
+  tz = __builtin_ctzg(vb8, i);
 }
 
 #endif

From 65bdc4c536bdf9588d357a8bb7281b96fffc6fb6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Thu, 21 Aug 2025 16:13:39 -0500
Subject: [PATCH 2/6] docs

---
 clang/docs/LanguageExtensions.rst | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 40f8fc9190f94..d6584121b148f 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4214,7 +4214,9 @@ such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
 
 ``__builtin_clzg`` (respectively ``__builtin_ctzg``) returns the number of
 leading (respectively trailing) 0 bits in the first argument. The first argument
-can be of any unsigned integer type or fixed boolean vector.
+can be of any unsigned integer type or fixed boolean vector. Boolean vectors
+behave like a bit field where the least significant bits are trailing and the
+most significant bits are leading.
 
 If the first argument is 0 and an optional second argument of ``int`` type is
 provided, then the second argument is returned. If the first argument is 0, but

From b6629161080ed60f17b9cd16845ff9c372897062 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Thu, 21 Aug 2025 16:24:42 -0500
Subject: [PATCH 3/6] simplify codegen

---
 clang/lib/CodeGen/CGBuiltin.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index b98218c36161d..0979104e945a8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -1704,10 +1704,7 @@ static llvm::Value *EmitBitCountExpr(CodeGenFunction &CGF, const Expr *E) {
       VT && VT->getElementType()->isIntegerTy(1)) {
     llvm::Type *StorageType =
         llvm::Type::getIntNTy(CGF.getLLVMContext(), VT->getNumElements());
-    ArgValue = CGF.emitBoolVecConversion(
-        ArgValue, StorageType->getPrimitiveSizeInBits(), "insertvec");
     ArgValue = CGF.Builder.CreateBitCast(ArgValue, StorageType);
-    ArgType = ArgValue->getType();
   }
 
   return ArgValue;

From 10f6bb2f6a0f86796daa46f60e5659ad751de4a6 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Thu, 21 Aug 2025 16:26:47 -0500
Subject: [PATCH 4/6] Improve docs

---
 clang/docs/LanguageExtensions.rst | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index d6584121b148f..1299582b2f5ea 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -4214,9 +4214,13 @@ such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
 
 ``__builtin_clzg`` (respectively ``__builtin_ctzg``) returns the number of
 leading (respectively trailing) 0 bits in the first argument. The first argument
-can be of any unsigned integer type or fixed boolean vector. Boolean vectors
-behave like a bit field where the least significant bits are trailing and the
-most significant bits are leading.
+can be of any unsigned integer type or fixed boolean vector.
+
+For boolean vectors, these builtins interpret the vector like a bit-field where
+the ith element of the vector is bit i of the bit-field, counting from the
+least significant end. ``__builtin_clzg`` returns the number of zero elements at
+the end of the vector, while ``__builtin_ctzg`` returns the number of zero
+elements at the start of the vector.
 
 If the first argument is 0 and an optional second argument of ``int`` type is
 provided, then the second argument is returned. If the first argument is 0, but
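
A worked example of that ordering (sketch; the vector value is
illustrative and assumes the patch is applied):

  typedef bool vb8 __attribute__((ext_vector_type(8)));

  int demo() {
    vb8 v = {false, false, true, false, false, false, false, false};
    // Only element 2 is set, i.e. bit 2 of an 8-bit mask (value 4).
    int tz = __builtin_ctzg(v, 8); // 2: elements 0 and 1 are zero
    int lz = __builtin_clzg(v, 8); // 5: elements 3 through 7 are zero
    return tz + lz;
  }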

From f29abf9ba336a9eb0bcceb4375cf86bfabeb3e5a Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Thu, 21 Aug 2025 16:29:26 -0500
Subject: [PATCH 5/6] Fix missing line

---
 clang/docs/ReleaseNotes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f0732deec23c3..3e495d8dfa136 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -149,6 +149,7 @@ Non-comprehensive list of changes in this release
 - Use of ``__has_feature`` to detect the ``ptrauth_qualifier`` and ``ptrauth_intrinsics``
   features has been deprecated, and is restricted to the arm64e target only. The
   correct method to check for these features is to test for the ``__PTRAUTH__``
+  macro.
 
 - Added a new builtin, ``__builtin_dedup_pack``, to remove duplicate types from a parameter pack.
   This feature is particularly useful in template metaprogramming for normalizing type lists.

From acd708cf5e8a8b57126df0fcbe3cca35654b2038 Mon Sep 17 00:00:00 2001
From: Joseph Huber <hube...@outlook.com>
Date: Thu, 21 Aug 2025 22:13:12 -0500
Subject: [PATCH 6/6] comments

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5039d9950a4a4..79040d45cb010 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -145,11 +145,11 @@ static llvm::APSInt convertBoolVectorToInt(const Pointer &Val) {
   assert(Val.getFieldDesc()->isPrimitiveArray() &&
          Val.getFieldDesc()->getElemQualType()->isBooleanType() &&
          "Not a boolean vector");
-  unsigned NumElts = Val.getNumElems();
+  unsigned NumElems = Val.getNumElems();
 
   // Each element is one bit, so create an integer with NumElems bits.
-  llvm::APSInt Result(NumElts, 0);
-  for (unsigned I = 0; I < NumElts; ++I) {
+  llvm::APSInt Result(NumElems, 0);
+  for (unsigned I = 0; I != NumElems; ++I) {
     if (Val.elem<bool>(I))
       Result.setBit(I);
   }
