sagarkulkarni19 updated this revision to Diff 459803.
sagarkulkarni19 added a comment.
- Support Opaque pointers
- Correct predicate types for the intrinsics.
- Decorate intrinsics with SME attributes.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D127910/new/
https://reviews.llvm.org/D127910
Files:
clang/include/clang/Basic/TargetBuiltins.h
clang/include/clang/Basic/arm_sve.td
clang/lib/CodeGen/CGBuiltin.cpp
clang/lib/CodeGen/CodeGenFunction.h
clang/lib/Headers/CMakeLists.txt
clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
clang/utils/TableGen/SveEmitter.cpp
clang/utils/TableGen/TableGen.cpp
clang/utils/TableGen/TableGenBackends.h
Index: clang/utils/TableGen/TableGenBackends.h
===================================================================
--- clang/utils/TableGen/TableGenBackends.h
+++ clang/utils/TableGen/TableGenBackends.h
@@ -101,6 +101,8 @@
void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+
void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
Index: clang/utils/TableGen/TableGen.cpp
===================================================================
--- clang/utils/TableGen/TableGen.cpp
+++ clang/utils/TableGen/TableGen.cpp
@@ -81,6 +81,7 @@
GenArmSveBuiltinCG,
GenArmSveTypeFlags,
GenArmSveRangeChecks,
+ GenArmSmeHeader,
GenArmCdeHeader,
GenArmCdeBuiltinDef,
GenArmCdeBuiltinSema,
@@ -219,6 +220,8 @@
"Generate arm_sve_typeflags.inc for clang"),
clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks",
"Generate arm_sve_sema_rangechecks.inc for clang"),
+ clEnumValN(GenArmSmeHeader, "gen-arm-sme-header",
+ "Generate arm_sme.h for clang"),
clEnumValN(GenArmMveHeader, "gen-arm-mve-header",
"Generate arm_mve.h for clang"),
clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def",
@@ -438,6 +441,9 @@
case GenArmSveRangeChecks:
EmitSveRangeChecks(Records, OS);
break;
+ case GenArmSmeHeader:
+ EmitSmeHeader(Records, OS);
+ break;
case GenArmCdeHeader:
EmitCdeHeader(Records, OS);
break;
Index: clang/utils/TableGen/SveEmitter.cpp
===================================================================
--- clang/utils/TableGen/SveEmitter.cpp
+++ clang/utils/TableGen/SveEmitter.cpp
@@ -169,6 +169,11 @@
SmallVector<ImmCheck, 2> ImmChecks;
+ /// True if this is an SME intrinsic.
+ bool IsSMEIntrinsic;
+ /// Attributes for SME intrinsics.
+ std::string SMEAttributes;
+
public:
Intrinsic(StringRef Name, StringRef Proto, uint64_t MergeTy,
StringRef MergeSuffix, uint64_t MemoryElementTy, StringRef LLVMName,
@@ -194,6 +199,10 @@
uint64_t getFlags() const { return Flags; }
bool isFlagSet(uint64_t Flag) const { return Flags & Flag;}
+ bool isSMEIntrinsic() const { return IsSMEIntrinsic; }
+ // Return a comma separated string of SME attributes.
+ std::string getSMEAttributes() const { return SMEAttributes; }
+
ArrayRef<ImmCheck> getImmChecks() const { return ImmChecks; }
/// Return the type string for a BUILTIN() macro in Builtins.def.
@@ -334,6 +343,9 @@
/// Emit arm_sve.h.
void createHeader(raw_ostream &o);
+ /// Emit arm_sme.h.
+ void createSMEHeader(raw_ostream &o);
+
/// Emit all the __builtin prototypes and code needed by Sema.
void createBuiltins(raw_ostream &o);
@@ -347,7 +359,9 @@
void createTypeFlags(raw_ostream &o);
/// Create intrinsic and add it to \p Out
- void createIntrinsic(Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out);
+ void createIntrinsic(Record *R,
+ SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out,
+ bool IsSME = false);
};
} // end anonymous namespace
@@ -757,6 +771,11 @@
NumVectors = 0;
Signed = true;
break;
+ case '%':
+ Pointer = true;
+ Void = true;
+ NumVectors = 0;
+ break;
case 'A':
Pointer = true;
ElementBitwidth = Bitwidth = 8;
@@ -840,6 +859,16 @@
this->Flags |= Emitter.encodeMergeType(MergeTy);
if (hasSplat())
this->Flags |= Emitter.encodeSplatOperand(getSplatIdx());
+
+ // Set attributes for SME intrinsics.
+ if (this->Flags & Emitter.getEnumValueForFlag("IsSME")) {
+ this->IsSMEIntrinsic = true;
+ if (this->Flags & Emitter.getEnumValueForFlag("IsSMELd1"))
+ this->SMEAttributes = "arm_streaming, arm_shared_za";
+ else if (this->Flags & Emitter.getEnumValueForFlag("IsSMESt1"))
+ this->SMEAttributes = "arm_streaming, arm_shared_za, arm_preserves_za";
+ } else
+ this->IsSMEIntrinsic = false;
}
std::string Intrinsic::getBuiltinTypeStr() {
@@ -926,7 +955,10 @@
OS << (IsOverloaded ? "__aio " : "__ai ")
<< "__attribute__((__clang_arm_builtin_alias("
- << "__builtin_sve_" << FullName << ")))\n";
+ << "__builtin_sve_" << FullName << ")";
+ if (isSMEIntrinsic())
+ OS << ", " << getSMEAttributes();
+ OS << "))\n";
OS << getTypes()[0].str() << " " << ProtoName << "(";
for (unsigned I = 0; I < getTypes().size() - 1; ++I) {
@@ -989,7 +1021,7 @@
}
void SVEEmitter::createIntrinsic(
- Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out) {
+ Record *R, SmallVectorImpl<std::unique_ptr<Intrinsic>> &Out, bool IsSME) {
StringRef Name = R->getValueAsString("Name");
StringRef Proto = R->getValueAsString("Prototype");
StringRef Types = R->getValueAsString("Types");
@@ -1005,6 +1037,9 @@
for (auto FlagRec : FlagsList)
Flags |= FlagRec->getValueAsInt("Value");
+ bool SMEFlag = Flags & getEnumValueForFlag("IsSME");
+ if (SMEFlag != IsSME)
+ return;
// Create a dummy TypeSpec for non-overloaded builtins.
if (Types.empty()) {
assert((Flags & getEnumValueForFlag("IsOverloadNone")) &&
@@ -1288,11 +1323,90 @@
OS << "#endif /* __ARM_SVE_H */\n";
}
+void SVEEmitter::createSMEHeader(raw_ostream &OS) {
+ OS << "/*===---- arm_sme.h - ARM SME intrinsics "
+ "-----------------------------------===\n"
+ " *\n"
+ " *\n"
+ " * Part of the LLVM Project, under the Apache License v2.0 with LLVM "
+ "Exceptions.\n"
+ " * See https://llvm.org/LICENSE.txt for license information.\n"
+ " * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n"
+ " *\n"
+ " *===-----------------------------------------------------------------"
+ "------===\n"
+ " */\n\n";
+
+ OS << "#ifndef __ARM_SME_H\n";
+ OS << "#define __ARM_SME_H\n\n";
+
+ OS << "#if !defined(__ARM_FEATURE_SME)\n";
+ OS << "#error \"SME support not enabled\"\n";
+ OS << "#else\n\n";
+
+ OS << "#include <arm_sve.h>\n\n";
+
+ OS << "/* Function attributes */\n";
+ OS << "#define __ai static __inline__ __attribute__((__always_inline__, "
+ "__nodebug__))\n\n";
+
+ OS << "#ifdef __cplusplus\n";
+ OS << "extern \"C\" {\n";
+ OS << "#endif\n\n";
+
+ SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
+ std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
+ for (auto *R : RV)
+ createIntrinsic(R, Defs, true);
+
+ // Sort intrinsics in the header file in the following order of priority, similar to SVE:
+ // - Architectural guard
+ // - Class (is intrinsic overloaded or not)
+ // - Intrinsic name
+ std::stable_sort(Defs.begin(), Defs.end(),
+ [](const std::unique_ptr<Intrinsic> &A,
+ const std::unique_ptr<Intrinsic> &B) {
+ auto ToTuple = [](const std::unique_ptr<Intrinsic> &I) {
+ return std::make_tuple(I->getGuard(),
+ (unsigned)I->getClassKind(),
+ I->getName());
+ };
+ return ToTuple(A) < ToTuple(B);
+ });
+
+ StringRef InGuard = "";
+ for (auto &I : Defs) {
+ // Emit #endif/#if pair if needed.
+ if (I->getGuard() != InGuard) {
+ if (!InGuard.empty())
+ OS << "#endif //" << InGuard << "\n";
+ InGuard = I->getGuard();
+ if (!InGuard.empty())
+ OS << "\n#if " << InGuard << "\n";
+ }
+
+ // Actually emit the intrinsic declaration.
+ I->emitIntrinsic(OS);
+ }
+
+ if (!InGuard.empty())
+ OS << "#endif //" << InGuard << "\n";
+
+ OS << "#ifdef __cplusplus\n";
+ OS << "} // extern \"C\"\n";
+ OS << "#endif\n\n";
+ OS << "#undef __ai\n\n";
+ OS << "#endif /* __ARM_FEATURE_SME */\n\n";
+ OS << "#endif /* __ARM_SME_H */\n";
+}
+
void SVEEmitter::createBuiltins(raw_ostream &OS) {
std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
- for (auto *R : RV)
+ for (auto *R : RV) {
createIntrinsic(R, Defs);
+ createIntrinsic(R, Defs, true);
+ }
// The mappings must be sorted based on BuiltinID.
llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1322,8 +1436,10 @@
void SVEEmitter::createCodeGenMap(raw_ostream &OS) {
std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
- for (auto *R : RV)
+ for (auto *R : RV) {
createIntrinsic(R, Defs);
+ createIntrinsic(R, Defs, true);
+ }
// The mappings must be sorted based on BuiltinID.
llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1355,8 +1471,10 @@
void SVEEmitter::createRangeChecks(raw_ostream &OS) {
std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
SmallVector<std::unique_ptr<Intrinsic>, 128> Defs;
- for (auto *R : RV)
+ for (auto *R : RV) {
createIntrinsic(R, Defs);
+ createIntrinsic(R, Defs, true);
+ }
// The mappings must be sorted based on BuiltinID.
llvm::sort(Defs, [](const std::unique_ptr<Intrinsic> &A,
@@ -1420,6 +1538,10 @@
SVEEmitter(Records).createHeader(OS);
}
+void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) {
+ SVEEmitter(Records).createSMEHeader(OS);
+}
+
void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) {
SVEEmitter(Records).createBuiltins(OS);
}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -0,0 +1,299 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svst1_hor_vnum_za8ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+ svst1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za16ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+ svst1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za32ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+ svst1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_hor_vnum_za64ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+ svst1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_hor_vnum_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svst1_hor_vnum_za128ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+ svst1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svst1_ver_vnum_za8ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+ svst1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za16ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+ svst1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za32ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+ svst1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svst1_ver_vnum_za64ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+ svst1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+//
+//
+
+// CHECK-LABEL: @test_svst1_ver_vnum_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svst1_ver_vnum_za128ju10__SVBool_tPvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, void *ptr, int64_t vnum) {
+ svst1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+ svst1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -0,0 +1,209 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svst1_hor_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_hor_za8(0, slice_base, 0, pg, ptr);
+ svst1_hor_za8(0, slice_base, 15, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_hor_za16(0, slice_base, 0, pg, ptr);
+ svst1_hor_za16(1, slice_base, 7, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_hor_za32(0, slice_base, 0, pg, ptr);
+ svst1_hor_za32(3, slice_base, 3, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_hor_za64(0, slice_base, 0, pg, ptr);
+ svst1_hor_za64(7, slice_base, 1, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_hor_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_hor_za128(0, slice_base, 0, pg, ptr);
+ svst1_hor_za128(15, slice_base, 0, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_ver_za8(0, slice_base, 0, pg, ptr);
+ svst1_ver_za8(0, slice_base, 15, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_ver_za16(0, slice_base, 0, pg, ptr);
+ svst1_ver_za16(1, slice_base, 7, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_ver_za32(0, slice_base, 0, pg, ptr);
+ svst1_ver_za32(3, slice_base, 3, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_ver_za64(0, slice_base, 0, pg, ptr);
+ svst1_ver_za64(7, slice_base, 1, pg, ptr);
+}
+
+// CHECK-LABEL: @test_svst1_ver_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) {
+ svst1_ver_za128(0, slice_base, 0, pg, ptr);
+ svst1_ver_za128(15, slice_base, 0, pg, ptr);
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c
@@ -0,0 +1,269 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svld1_hor_vnum_za8ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_hor_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+ svld1_hor_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za16ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_hor_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+ svld1_hor_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za32ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_hor_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+ svld1_hor_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_hor_vnum_za64ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_hor_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+ svld1_hor_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_hor_vnum_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svld1_hor_vnum_za128ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_hor_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+ svld1_hor_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_svld1_ver_vnum_za8ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT:  entry:
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT:    [[MULVL:%.*]] = mul i64 [[TMP0]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[TMP1]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT:    [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[TMP1]], i64 0, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT:    ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_vnum_za8(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+  svld1_ver_vnum_za8(0, slice_base, 0, pg, ptr, vnum);
+  svld1_ver_vnum_za8(0, slice_base, 15, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za16ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[TMP2]], i64 1, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_vnum_za16(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_ver_vnum_za16(0, slice_base, 0, pg, ptr, vnum);
+ svld1_ver_vnum_za16(1, slice_base, 7, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za32ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[TMP2]], i64 3, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_vnum_za32(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_ver_vnum_za32(0, slice_base, 0, pg, ptr, vnum);
+ svld1_ver_vnum_za32(3, slice_base, 3, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z24test_svld1_ver_vnum_za64ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[TMP2]], i64 7, i32 [[TILESLICE2]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_vnum_za64(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_ver_vnum_za64(0, slice_base, 0, pg, ptr, vnum);
+ svld1_ver_vnum_za64(7, slice_base, 1, pg, ptr, vnum);
+}
+
+// CHECK-LABEL: @test_svld1_ver_vnum_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z25test_svld1_ver_vnum_za128ju10__SVBool_tPKvl(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb()
+// CPP-CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[TMP1]], [[VNUM:%.*]]
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = getelementptr i128, ptr [[PTR:%.*]], i64 [[MULVL]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[TMP2]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_vnum_za128(uint32_t slice_base, svbool_t pg, const void *ptr, int64_t vnum) {
+ svld1_ver_vnum_za128(0, slice_base, 0, pg, ptr, vnum);
+ svld1_ver_vnum_za128(15, slice_base, 0, pg, ptr, vnum);
+}
Index: clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c
@@ -0,0 +1,239 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --force-update
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -D__ARM_FEATURE_SME -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s
+
+#include <arm_sme.h>
+
+//
+//
+
+// CHECK-LABEL: @test_svld1_hor_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_hor_za8(0, slice_base, 0, pg, ptr);
+ svld1_hor_za8(0, slice_base, 15, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_hor_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_hor_za16(0, slice_base, 0, pg, ptr);
+ svld1_hor_za16(1, slice_base, 7, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_hor_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_hor_za32(0, slice_base, 0, pg, ptr);
+ svld1_hor_za32(3, slice_base, 3, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_hor_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_hor_za64(0, slice_base, 0, pg, ptr);
+ svld1_hor_za64(7, slice_base, 1, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_hor_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_hor_za128(0, slice_base, 0, pg, ptr);
+ svld1_hor_za128(15, slice_base, 0, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_ver_za8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG:%.*]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 15
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> [[PG]], ptr [[PTR]], i64 0, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_ver_za8(0, slice_base, 0, pg, ptr);
+ svld1_ver_za8(0, slice_base, 15, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_ver_za16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 7
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> [[TMP0]], ptr [[PTR]], i64 1, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_ver_za16(0, slice_base, 0, pg, ptr);
+ svld1_ver_za16(1, slice_base, 7, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_ver_za32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 3
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> [[TMP0]], ptr [[PTR]], i64 3, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_ver_za32(0, slice_base, 0, pg, ptr);
+ svld1_ver_za32(3, slice_base, 3, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_ver_za64(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> [[TMP0]], ptr [[PTR]], i64 7, i32 [[TILESLICE1]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_ver_za64(0, slice_base, 0, pg, ptr);
+ svld1_ver_za64(7, slice_base, 1, pg, ptr);
+}
+
+//
+//
+__attribute__((arm_streaming_compatible))
+// CHECK-LABEL: @test_svld1_ver_za128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKv(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 1 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv1i1(<vscale x 16 x i1> [[PG:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> [[TMP0]], ptr [[PTR]], i64 15, i32 [[SLICE_BASE]])
+// CPP-CHECK-NEXT: ret void
+//
+__attribute__((arm_streaming_compatible)) void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) {
+ svld1_ver_za128(0, slice_base, 0, pg, ptr);
+ svld1_ver_za128(15, slice_base, 0, pg, ptr);
+}
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -308,6 +308,8 @@
clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h)
# Generate arm_sve.h
clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h)
+ # Generate arm_sme.h
+ clang_generate_header(-gen-arm-sme-header arm_sve.td arm_sme.h)
# Generate arm_bf16.h
clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h)
# Generate arm_mve.h
@@ -332,6 +334,7 @@
list(APPEND aarch64_only_generated_files
"${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h"
+ "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h"
"${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h"
"${output_dir}/arm_neon_sve_bridge.h"
)
Index: clang/lib/CodeGen/CodeGenFunction.h
===================================================================
--- clang/lib/CodeGen/CodeGenFunction.h
+++ clang/lib/CodeGen/CodeGenFunction.h
@@ -4230,6 +4230,10 @@
llvm::Value *EmitSVEMaskedStore(const CallExpr *,
SmallVectorImpl<llvm::Value *> &Ops,
unsigned BuiltinID);
+ llvm::Value *EmitTileslice(llvm::Value *Offset, llvm::Value *Base);
+ llvm::Value *EmitSMELd1St1(SVETypeFlags TypeFlags,
+ llvm::SmallVectorImpl<llvm::Value *> &Ops,
+ unsigned IntID);
llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags,
SmallVectorImpl<llvm::Value *> &Ops,
unsigned BuiltinID);
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -8673,6 +8673,7 @@
switch (VTy->getMinNumElements()) {
default:
llvm_unreachable("unsupported element count!");
+ case 1:
case 2:
case 4:
case 8:
@@ -9026,6 +9027,75 @@
return Store;
}
+Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) {
+ llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int32Ty, false);
+ return Builder.CreateAdd(Base, CastOffset, "tileslice");
+}
+
+Value *CodeGenFunction::EmitSMELd1St1(SVETypeFlags TypeFlags,
+ SmallVectorImpl<Value *> &Ops,
+ unsigned IntID) {
+ llvm::Type *BasePtrType;
+ switch (IntID) {
+ case Intrinsic::aarch64_sme_ld1h_horiz:
+ case Intrinsic::aarch64_sme_ld1h_vert:
+ case Intrinsic::aarch64_sme_st1h_horiz:
+ case Intrinsic::aarch64_sme_st1h_vert:
+ BasePtrType = Int16Ty;
+ Ops[3] = EmitSVEPredicateCast(
+ Ops[3], llvm::ScalableVectorType::get(Builder.getInt1Ty(), 8));
+ break;
+ case Intrinsic::aarch64_sme_ld1w_horiz:
+ case Intrinsic::aarch64_sme_ld1w_vert:
+ case Intrinsic::aarch64_sme_st1w_horiz:
+ case Intrinsic::aarch64_sme_st1w_vert:
+ BasePtrType = Int32Ty;
+ Ops[3] = EmitSVEPredicateCast(
+ Ops[3], llvm::ScalableVectorType::get(Builder.getInt1Ty(), 4));
+ break;
+ case Intrinsic::aarch64_sme_ld1d_horiz:
+ case Intrinsic::aarch64_sme_ld1d_vert:
+ case Intrinsic::aarch64_sme_st1d_horiz:
+ case Intrinsic::aarch64_sme_st1d_vert:
+ BasePtrType = Int64Ty;
+ Ops[3] = EmitSVEPredicateCast(
+ Ops[3], llvm::ScalableVectorType::get(Builder.getInt1Ty(), 2));
+ break;
+ case Intrinsic::aarch64_sme_ld1q_horiz:
+ case Intrinsic::aarch64_sme_ld1q_vert:
+ case Intrinsic::aarch64_sme_st1q_horiz:
+ case Intrinsic::aarch64_sme_st1q_vert:
+ BasePtrType = llvm::IntegerType::get(getLLVMContext(), 128);
+ Ops[3] = EmitSVEPredicateCast(
+ Ops[3], llvm::ScalableVectorType::get(Builder.getInt1Ty(), 1));
+ break;
+ default:
+ BasePtrType = Int8Ty;
+ break;
+ }
+
+ SmallVector<Value *> NewOps;
+ NewOps.push_back(Ops[3]);
+
+ llvm::Value *BasePtr = Ops[4];
+
+  // A 6-operand builtin carries a trailing vnum argument: offset the base
+ if (Ops.size() == 6) {
+ Function *StreamingVectorLength =
+ CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb, {});
+ llvm::Value *StreamingVectorLengthCall =
+ Builder.CreateCall(StreamingVectorLength, {});
+ llvm::Value *Mulvl =
+ Builder.CreateMul(StreamingVectorLengthCall, Ops[5], "mulvl");
+ BasePtr = Builder.CreateGEP(BasePtrType, Ops[4], Mulvl);
+ }
+ NewOps.push_back(BasePtr);
+ NewOps.push_back(Builder.CreateIntCast(Ops[0], Int64Ty, false));
+ NewOps.push_back(EmitTileslice(Ops[2], Ops[1]));
+ Function *F = CGM.getIntrinsic(IntID, {});
+ return Builder.CreateCall(F, NewOps);
+}
+
// Limit the usage of scalable llvm IR generated by the ACLE by using the
// sve dup.x intrinsic instead of IRBuilder::CreateVectorSplat.
Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
@@ -9156,6 +9226,8 @@
TypeFlags.isZExtReturn());
else if (TypeFlags.isStore())
return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic);
+ else if (TypeFlags.isSMELd1() || TypeFlags.isSMESt1())
+ return EmitSMELd1St1(TypeFlags, Ops, Builtin->LLVMIntrinsic);
else if (TypeFlags.isGatherLoad())
return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic);
else if (TypeFlags.isScatterStore())
Index: clang/include/clang/Basic/arm_sve.td
===================================================================
--- clang/include/clang/Basic/arm_sve.td
+++ clang/include/clang/Basic/arm_sve.td
@@ -98,6 +98,7 @@
// N: svfloat64_t
// J: Prefetch type (sv_prfop)
+// %: pointer to void
// A: pointer to int8_t
// B: pointer to int16_t
// C: pointer to int32_t
@@ -205,6 +206,9 @@
def IsTupleCreate : FlagType<0x100000000>;
def IsTupleGet : FlagType<0x200000000>;
def IsTupleSet : FlagType<0x400000000>;
+def IsSME : FlagType<0x800000000>;
+def IsSMELd1 : FlagType<0x1000000000>;
+def IsSMESt1 : FlagType<0x2000000000>;
// These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
class ImmCheckType<int val> {
@@ -542,6 +546,28 @@
def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}
+def SVLD1_HOR_ZA8 : MInst<"svld1_hor_za8", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1b_horiz">;
+def SVLD1_HOR_ZA16 : MInst<"svld1_hor_za16", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1h_horiz">;
+def SVLD1_HOR_ZA32 : MInst<"svld1_hor_za32", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1w_horiz">;
+def SVLD1_HOR_ZA64 : MInst<"svld1_hor_za64", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1d_horiz">;
+def SVLD1_HOR_ZA128 : MInst<"svld1_hor_za128", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1q_horiz">;
+def SVLD1_VER_ZA8 : MInst<"svld1_ver_za8", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1b_vert">;
+def SVLD1_VER_ZA16 : MInst<"svld1_ver_za16", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1h_vert">;
+def SVLD1_VER_ZA32 : MInst<"svld1_ver_za32", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1w_vert">;
+def SVLD1_VER_ZA64 : MInst<"svld1_ver_za64", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1d_vert">;
+def SVLD1_VER_ZA128 : MInst<"svld1_ver_za128", "vimiPQ", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1q_vert">;
+
+def SVLD1_HOR_VNUM_ZA8 : MInst<"svld1_hor_vnum_za8", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1b_horiz">;
+def SVLD1_HOR_VNUM_ZA16 : MInst<"svld1_hor_vnum_za16", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1h_horiz">;
+def SVLD1_HOR_VNUM_ZA32 : MInst<"svld1_hor_vnum_za32", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1w_horiz">;
+def SVLD1_HOR_VNUM_ZA64 : MInst<"svld1_hor_vnum_za64", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1d_horiz">;
+def SVLD1_HOR_VNUM_ZA128 : MInst<"svld1_hor_vnum_za128", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1q_horiz">;
+def SVLD1_VER_VNUM_ZA8 : MInst<"svld1_ver_vnum_za8", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1b_vert">;
+def SVLD1_VER_VNUM_ZA16 : MInst<"svld1_ver_vnum_za16", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1h_vert">;
+def SVLD1_VER_VNUM_ZA32 : MInst<"svld1_ver_vnum_za32", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1w_vert">;
+def SVLD1_VER_VNUM_ZA64 : MInst<"svld1_ver_vnum_za64", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1d_vert">;
+def SVLD1_VER_VNUM_ZA128 : MInst<"svld1_ver_vnum_za128", "vimiPQl", "", [IsOverloadNone, IsSME, IsSMELd1], MemEltTyDefault, "aarch64_sme_ld1q_vert">;
+
////////////////////////////////////////////////////////////////////////////////
// Stores
@@ -664,6 +690,28 @@
def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
}
+def SVST1_HOR_ZA8 : MInst<"svst1_hor_za8", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1b_horiz">;
+def SVST1_HOR_ZA16 : MInst<"svst1_hor_za16", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1h_horiz">;
+def SVST1_HOR_ZA32 : MInst<"svst1_hor_za32", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1w_horiz">;
+def SVST1_HOR_ZA64 : MInst<"svst1_hor_za64", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1d_horiz">;
+def SVST1_HOR_ZA128 : MInst<"svst1_hor_za128", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1q_horiz">;
+def SVST1_VER_ZA8 : MInst<"svst1_ver_za8", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1b_vert">;
+def SVST1_VER_ZA16 : MInst<"svst1_ver_za16", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1h_vert">;
+def SVST1_VER_ZA32 : MInst<"svst1_ver_za32", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1w_vert">;
+def SVST1_VER_ZA64 : MInst<"svst1_ver_za64", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1d_vert">;
+def SVST1_VER_ZA128 : MInst<"svst1_ver_za128", "vimiP%", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1q_vert">;
+
+def SVST1_HOR_VNUM_ZA8 : MInst<"svst1_hor_vnum_za8", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1b_horiz">;
+def SVST1_HOR_VNUM_ZA16 : MInst<"svst1_hor_vnum_za16", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1h_horiz">;
+def SVST1_HOR_VNUM_ZA32 : MInst<"svst1_hor_vnum_za32", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1w_horiz">;
+def SVST1_HOR_VNUM_ZA64 : MInst<"svst1_hor_vnum_za64", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1d_horiz">;
+def SVST1_HOR_VNUM_ZA128 : MInst<"svst1_hor_vnum_za128", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1q_horiz">;
+def SVST1_VER_VNUM_ZA8 : MInst<"svst1_ver_vnum_za8", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1b_vert">;
+def SVST1_VER_VNUM_ZA16 : MInst<"svst1_ver_vnum_za16", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1h_vert">;
+def SVST1_VER_VNUM_ZA32 : MInst<"svst1_ver_vnum_za32", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1w_vert">;
+def SVST1_VER_VNUM_ZA64 : MInst<"svst1_ver_vnum_za64", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1d_vert">;
+def SVST1_VER_VNUM_ZA128 : MInst<"svst1_ver_vnum_za128", "vimiP%l", "", [IsOverloadNone, IsSME, IsSMESt1], MemEltTyDefault, "aarch64_sme_st1q_vert">;
+
////////////////////////////////////////////////////////////////////////////////
// Prefetches
Index: clang/include/clang/Basic/TargetBuiltins.h
===================================================================
--- clang/include/clang/Basic/TargetBuiltins.h
+++ clang/include/clang/Basic/TargetBuiltins.h
@@ -281,6 +281,9 @@
bool isTupleCreate() const { return Flags & IsTupleCreate; }
bool isTupleGet() const { return Flags & IsTupleGet; }
bool isTupleSet() const { return Flags & IsTupleSet; }
+ bool isSME() const { return Flags & IsSME; }
+ bool isSMELd1() const { return Flags & IsSMELd1; }
+ bool isSMESt1() const { return Flags & IsSMESt1; }
uint64_t getBits() const { return Flags; }
bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits