https://github.com/kmclaughlin-arm updated 
https://github.com/llvm/llvm-project/pull/145346

>From b2d9f70eb33ebbb26166bea4ba79f05204fc3cc2 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaugh...@arm.com>
Date: Mon, 23 Jun 2025 13:07:34 +0000
Subject: [PATCH 1/3] [Clang][AArch64] Add FP8 variants of Neon store
 intrinsics

Adds FP8 variants for existing VST1, VST2, VST3 & VST4 intrinsics.
---
 clang/include/clang/Basic/arm_neon.td         |  22 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |  22 +
 .../fp8-intrinsics/acle_neon_fp8_stores.c     | 475 ++++++++++++++++++
 3 files changed, 518 insertions(+), 1 deletion(-)
 create mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c

diff --git a/clang/include/clang/Basic/arm_neon.td 
b/clang/include/clang/Basic/arm_neon.td
index 7251cc2d1759a..314330ed9fde6 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2119,6 +2119,26 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = 
"lut" in {
   }
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VST1_MF8 : WInst<"vst1", "v*(.!)", "mQm">;
+  def VST2_MF8 : WInst<"vst2", "v*(2!)", "mQm">;
+  def VST3_MF8 : WInst<"vst3", "v*(3!)", "mQm">;
+  def VST4_MF8 : WInst<"vst4", "v*(4!)", "mQm">;
+
+  def VST1_X2_MF8 : WInst<"vst1_x2", "v*(2!)", "mQm">;
+  def VST1_X3_MF8 : WInst<"vst1_x3", "v*(3!)", "mQm">;
+  def VST1_X4_MF8 : WInst<"vst1_x4", "v*(4!)", "mQm">;
+
+  def VST1_LANE_MF8 : WInst<"vst1_lane", "v*(.!)I", "mQm",
+                           [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
+  def VST2_LANE_MF8 : WInst<"vst2_lane", "v*(2!)I", "mQm",
+                           [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
+  def VST3_LANE_MF8 : WInst<"vst3_lane", "v*(3!)I", "mQm",
+                           [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
+  def VST4_LANE_MF8 : WInst<"vst4_lane", "v*(4!)I", "mQm",
+                           [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", 
"m">;
   def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    
"Hm">;
@@ -2194,4 +2214,4 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = 
"fp8,neon" in {
   // fscale
   def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
   def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
-}
\ No newline at end of file
+}
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 6738d4be6dd21..3bd5054050036 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -1553,6 +1553,28 @@ static const std::pair<unsigned, unsigned> 
NEONEquivalentIntrinsicMap[] = {
   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v 
},
   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
   { NEON::BI__builtin_neon_vst4q_lane_bf16, 
NEON::BI__builtin_neon_vst4q_lane_v },
+  { NEON::BI__builtin_neon_vst1_mf8_x2, NEON::BI__builtin_neon_vst1_x2_v },
+  { NEON::BI__builtin_neon_vst1_mf8_x3, NEON::BI__builtin_neon_vst1_x3_v },
+  { NEON::BI__builtin_neon_vst1_mf8_x4, NEON::BI__builtin_neon_vst1_x4_v },
+  { NEON::BI__builtin_neon_vst1_mf8, NEON::BI__builtin_neon_vst1_v },
+  { NEON::BI__builtin_neon_vst1_lane_mf8, NEON::BI__builtin_neon_vst1_lane_v },
+  { NEON::BI__builtin_neon_vst1q_mf8_x2, NEON::BI__builtin_neon_vst1q_x2_v },
+  { NEON::BI__builtin_neon_vst1q_mf8_x3, NEON::BI__builtin_neon_vst1q_x3_v },
+  { NEON::BI__builtin_neon_vst1q_mf8_x4, NEON::BI__builtin_neon_vst1q_x4_v },
+  { NEON::BI__builtin_neon_vst1q_mf8, NEON::BI__builtin_neon_vst1q_v },
+  { NEON::BI__builtin_neon_vst1q_lane_mf8, NEON::BI__builtin_neon_vst1q_lane_v 
},
+  { NEON::BI__builtin_neon_vst2_mf8, NEON::BI__builtin_neon_vst2_v },
+  { NEON::BI__builtin_neon_vst2_lane_mf8, NEON::BI__builtin_neon_vst2_lane_v },
+  { NEON::BI__builtin_neon_vst2q_mf8, NEON::BI__builtin_neon_vst2q_v },
+  { NEON::BI__builtin_neon_vst2q_lane_mf8, NEON::BI__builtin_neon_vst2q_lane_v 
},
+  { NEON::BI__builtin_neon_vst3_mf8, NEON::BI__builtin_neon_vst3_v },
+  { NEON::BI__builtin_neon_vst3_lane_mf8, NEON::BI__builtin_neon_vst3_lane_v },
+  { NEON::BI__builtin_neon_vst3q_mf8, NEON::BI__builtin_neon_vst3q_v },
+  { NEON::BI__builtin_neon_vst3q_lane_mf8, NEON::BI__builtin_neon_vst3q_lane_v 
},
+  { NEON::BI__builtin_neon_vst4_mf8, NEON::BI__builtin_neon_vst4_v },
+  { NEON::BI__builtin_neon_vst4_lane_mf8, NEON::BI__builtin_neon_vst4_lane_v },
+  { NEON::BI__builtin_neon_vst4q_mf8, NEON::BI__builtin_neon_vst4q_v },
+  { NEON::BI__builtin_neon_vst4q_lane_mf8, NEON::BI__builtin_neon_vst4q_lane_v 
},
   // The mangling rules cause us to have one ID for each type for vldap1(q)_lane
   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
   // arbitrary one to be handled as the canonical variation.
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c 
b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
new file mode 100644
index 0000000000000..f09bacdbe6302
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
@@ -0,0 +1,475 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8  \
+// RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8  \
+// RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8 -O3 -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include "arm_neon.h"
+
+// CHECK-LABEL: define dso_local void @test_vst1_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) 
#[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    store <8 x i8> [[VAL]], ptr [[PTR]], align 1
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst1_mf8Pu6__mfp813__Mfloat8x8_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) 
#[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    store <8 x i8> [[VAL]], ptr [[PTR]], align 1
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1_mf8(mfloat8_t *ptr, mfloat8x8_t val) {
+  vst1_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    store <16 x i8> [[VAL]], ptr [[PTR]], align 1
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst1q_mf8Pu6__mfp814__Mfloat8x16_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    store <16 x i8> [[VAL]], ptr [[PTR]], align 1
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1q_mf8(mfloat8_t *ptr, mfloat8x16_t val) {
+  vst1q_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst1_lane_mf8Pu6__mfp813__Mfloat8x8_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7
+// CHECK-CXX-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1_lane_mf8(mfloat8_t *ptr, mfloat8x8_t val) {
+  vst1_lane_mf8(ptr, val, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst1q_lane_mf8Pu6__mfp814__Mfloat8x16_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15
+// CHECK-CXX-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1q_lane_mf8(mfloat8_t *ptr, mfloat8x16_t val) {
+  vst1q_lane_mf8(ptr, val, 15);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x2(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x2Pu6__mfp813mfloat8x8x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1_mf8_x2(mfloat8_t *ptr, mfloat8x8x2_t val) {
+  vst1_mf8_x2(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x2(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x2Pu6__mfp814mfloat8x16x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1q_mf8_x2(mfloat8_t *ptr, mfloat8x16x2_t val) {
+  vst1q_mf8_x2(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x3(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x3Pu6__mfp813mfloat8x8x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1_mf8_x3(mfloat8_t *ptr, mfloat8x8x3_t val) {
+  vst1_mf8_x3(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x3(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x3Pu6__mfp814mfloat8x16x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1q_mf8_x3(mfloat8_t *ptr, mfloat8x16x3_t val) {
+  vst1q_mf8_x3(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x4(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x4Pu6__mfp813mfloat8x8x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1_mf8_x4(mfloat8_t *ptr, mfloat8x8x4_t val) {
+  vst1_mf8_x4(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x4(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x4Pu6__mfp814mfloat8x16x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst1q_mf8_x4(mfloat8_t *ptr, mfloat8x16x4_t val) {
+  vst1q_mf8_x4(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst2_mf8Pu6__mfp813mfloat8x8x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst2_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) {
+  vst2_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst2q_mf8Pu6__mfp814mfloat8x16x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst2q_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) {
+  vst2q_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst2_lane_mf8Pu6__mfp813mfloat8x8x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst2_lane_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) {
+  vst2_lane_mf8(ptr, val, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, 
ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst2q_lane_mf8Pu6__mfp814mfloat8x16x2_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, 
ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst2q_lane_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) {
+  vst2q_lane_mf8(ptr, val, 15);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst3_mf8Pu6__mfp813mfloat8x8x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst3_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) {
+  vst3_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst3q_mf8Pu6__mfp814mfloat8x16x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst3q_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) {
+  vst3q_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst3_lane_mf8Pu6__mfp813mfloat8x8x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst3_lane_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) {
+  vst3_lane_mf8(ptr, val, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst3q_lane_mf8Pu6__mfp814mfloat8x16x3_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst3q_lane_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) {
+  vst3q_lane_mf8(ptr, val, 15);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst4_mf8Pu6__mfp813mfloat8x8x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst4_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) {
+  vst4_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst4q_mf8Pu6__mfp814mfloat8x16x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst4q_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) {
+  vst4q_mf8(ptr, val);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr 
[[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst4_lane_mf8Pu6__mfp813mfloat8x8x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr 
[[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst4_lane_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) {
+  vst4_lane_mf8(ptr, val, 7);
+}
+
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_mf8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
+// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
+// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
+// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, 
ptr [[PTR]])
+// CHECK-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst4q_lane_mf8Pu6__mfp814mfloat8x16x4_t(
+// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
+// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, 
ptr [[PTR]])
+// CHECK-CXX-NEXT:    ret void
+//
+void test_vst4q_lane_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) {
+  vst4q_lane_mf8(ptr, val, 15);
+}

>From dcbbaa8368251507c6181aab835cba6376078f55 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaugh...@arm.com>
Date: Tue, 24 Jun 2025 10:35:23 +0000
Subject: [PATCH 2/3] - Removed TargetGuard = "fp8,neon"

---
 clang/include/clang/Basic/arm_neon.td         | 42 +++++--------------
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      | 22 ----------
 .../fp8-intrinsics/acle_neon_fp8_stores.c     |  6 +--
 3 files changed, 14 insertions(+), 56 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td 
b/clang/include/clang/Basic/arm_neon.td
index 314330ed9fde6..a79bb6c27aceb 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -466,15 +466,15 @@ def VLD1_LANE : WInst<"vld1_lane", ".(c*!).I",
 def VLD1_DUP  : WInst<"vld1_dup", ".(c*!)",
                       "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
 def VST1      : WInst<"vst1", "v*(.!)",
-                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">;
+                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
 def VST1_X2   : WInst<"vst1_x2", "v*(2!)",
-                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
+                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
 def VST1_X3   : WInst<"vst1_x3", "v*(3!)",
-                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
+                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
 def VST1_X4   : WInst<"vst1_x4", "v*(4!)",
-                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">;
+                      "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPsmQm">;
 def VST1_LANE : WInst<"vst1_lane", "v*(.!)I",
-                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs",
+                      "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPsmQm",
                       [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
 
 let ArchGuard = "(__ARM_FP & 2)" in {
@@ -510,14 +510,14 @@ def VLD3_LANE : WInst<"vld3_lane", "3(c*!)3I", 
"QUsQUiQsQiQfQPsUcUsUicsifPcPs",
                       [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
 def VLD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
                       [ImmCheck<6, ImmCheckLaneIndex, 1>]>;
-def VST2 : WInst<"vst2", "v*(2!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
-def VST3 : WInst<"vst3", "v*(3!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
-def VST4 : WInst<"vst4", "v*(4!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">;
-def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
+def VST2 : WInst<"vst2", "v*(2!)", 
"QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
+def VST3 : WInst<"vst3", "v*(3!)", 
"QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
+def VST4 : WInst<"vst4", "v*(4!)", 
"QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPsmQm">;
+def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", 
"QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
                       [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
-def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
+def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", 
"QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
                       [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
-def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs",
+def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", 
"QUsQUiQsQiQfQPsUcUsUicsifPcPsmQm",
                       [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
 let ArchGuard = "(__ARM_FP & 2)" in {
 def VLD2_F16      : WInst<"vld2", "2(c*!)", "hQh">;
@@ -2119,26 +2119,6 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = 
"lut" in {
   }
 }
 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
-  def VST1_MF8 : WInst<"vst1", "v*(.!)", "mQm">;
-  def VST2_MF8 : WInst<"vst2", "v*(2!)", "mQm">;
-  def VST3_MF8 : WInst<"vst3", "v*(3!)", "mQm">;
-  def VST4_MF8 : WInst<"vst4", "v*(4!)", "mQm">;
-
-  def VST1_X2_MF8 : WInst<"vst1_x2", "v*(2!)", "mQm">;
-  def VST1_X3_MF8 : WInst<"vst1_x3", "v*(3!)", "mQm">;
-  def VST1_X4_MF8 : WInst<"vst1_x4", "v*(4!)", "mQm">;
-
-  def VST1_LANE_MF8 : WInst<"vst1_lane", "v*(.!)I", "mQm",
-                           [ImmCheck<2, ImmCheckLaneIndex, 1>]>;
-  def VST2_LANE_MF8 : WInst<"vst2_lane", "v*(2!)I", "mQm",
-                           [ImmCheck<3, ImmCheckLaneIndex, 1>]>;
-  def VST3_LANE_MF8 : WInst<"vst3_lane", "v*(3!)I", "mQm",
-                           [ImmCheck<4, ImmCheckLaneIndex, 1>]>;
-  def VST4_LANE_MF8 : WInst<"vst4_lane", "v*(4!)I", "mQm",
-                           [ImmCheck<5, ImmCheckLaneIndex, 1>]>;
-}
-
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", 
"m">;
   def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    
"Hm">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp 
b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 3bd5054050036..6738d4be6dd21 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -1553,28 +1553,6 @@ static const std::pair<unsigned, unsigned> 
NEONEquivalentIntrinsicMap[] = {
   { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v 
},
   { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v },
   { NEON::BI__builtin_neon_vst4q_lane_bf16, 
NEON::BI__builtin_neon_vst4q_lane_v },
-  { NEON::BI__builtin_neon_vst1_mf8_x2, NEON::BI__builtin_neon_vst1_x2_v },
-  { NEON::BI__builtin_neon_vst1_mf8_x3, NEON::BI__builtin_neon_vst1_x3_v },
-  { NEON::BI__builtin_neon_vst1_mf8_x4, NEON::BI__builtin_neon_vst1_x4_v },
-  { NEON::BI__builtin_neon_vst1_mf8, NEON::BI__builtin_neon_vst1_v },
-  { NEON::BI__builtin_neon_vst1_lane_mf8, NEON::BI__builtin_neon_vst1_lane_v },
-  { NEON::BI__builtin_neon_vst1q_mf8_x2, NEON::BI__builtin_neon_vst1q_x2_v },
-  { NEON::BI__builtin_neon_vst1q_mf8_x3, NEON::BI__builtin_neon_vst1q_x3_v },
-  { NEON::BI__builtin_neon_vst1q_mf8_x4, NEON::BI__builtin_neon_vst1q_x4_v },
-  { NEON::BI__builtin_neon_vst1q_mf8, NEON::BI__builtin_neon_vst1q_v },
-  { NEON::BI__builtin_neon_vst1q_lane_mf8, NEON::BI__builtin_neon_vst1q_lane_v 
},
-  { NEON::BI__builtin_neon_vst2_mf8, NEON::BI__builtin_neon_vst2_v },
-  { NEON::BI__builtin_neon_vst2_lane_mf8, NEON::BI__builtin_neon_vst2_lane_v },
-  { NEON::BI__builtin_neon_vst2q_mf8, NEON::BI__builtin_neon_vst2q_v },
-  { NEON::BI__builtin_neon_vst2q_lane_mf8, NEON::BI__builtin_neon_vst2q_lane_v 
},
-  { NEON::BI__builtin_neon_vst3_mf8, NEON::BI__builtin_neon_vst3_v },
-  { NEON::BI__builtin_neon_vst3_lane_mf8, NEON::BI__builtin_neon_vst3_lane_v },
-  { NEON::BI__builtin_neon_vst3q_mf8, NEON::BI__builtin_neon_vst3q_v },
-  { NEON::BI__builtin_neon_vst3q_lane_mf8, NEON::BI__builtin_neon_vst3q_lane_v 
},
-  { NEON::BI__builtin_neon_vst4_mf8, NEON::BI__builtin_neon_vst4_v },
-  { NEON::BI__builtin_neon_vst4_lane_mf8, NEON::BI__builtin_neon_vst4_lane_v },
-  { NEON::BI__builtin_neon_vst4q_mf8, NEON::BI__builtin_neon_vst4q_v },
-  { NEON::BI__builtin_neon_vst4q_lane_mf8, NEON::BI__builtin_neon_vst4q_lane_v 
},
   // The mangling rules cause us to have one ID for each type for 
vldap1(q)_lane
   // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an
   // arbitrary one to be handled as tha canonical variation.
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c 
b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
index f09bacdbe6302..a463347efdcf8 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
 
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8  \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon  \
 // RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8  \
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
\
 // RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon 
-target-feature +fp8 -O3 -Werror -Wall -S -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -O3 
-Werror -Wall -S -o /dev/null %s
 
 // REQUIRES: aarch64-registered-target
 

>From 3fb7f6fa3ae0f90eea79027edbb2b0db766ce469 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaugh...@arm.com>
Date: Fri, 27 Jun 2025 09:15:29 +0000
Subject: [PATCH 3/3] - Remove new test file and add tests to neon-intrinsics.c
 & neon-ldst-one.c

---
 .../fp8-intrinsics/acle_neon_fp8_stores.c     | 475 ------------------
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 176 +++++++
 clang/test/CodeGen/AArch64/neon-ldst-one.c    | 100 ++++
 3 files changed, 276 insertions(+), 475 deletions(-)
 delete mode 100644 
clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c 
b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
deleted file mode 100644
index a463347efdcf8..0000000000000
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c
+++ /dev/null
@@ -1,475 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 5
-
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon  \
-// RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon 
\
-// RUN:  -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -O3 
-Werror -Wall -S -o /dev/null %s
-
-// REQUIRES: aarch64-registered-target
-
-#include "arm_neon.h"
-
-// CHECK-LABEL: define dso_local void @test_vst1_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) 
#[[ATTR0:[0-9]+]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    store <8 x i8> [[VAL]], ptr [[PTR]], align 1
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst1_mf8Pu6__mfp813__Mfloat8x8_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) 
#[[ATTR0:[0-9]+]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    store <8 x i8> [[VAL]], ptr [[PTR]], align 1
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1_mf8(mfloat8_t *ptr, mfloat8x8_t val) {
-  vst1_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    store <16 x i8> [[VAL]], ptr [[PTR]], align 1
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst1q_mf8Pu6__mfp814__Mfloat8x16_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    store <16 x i8> [[VAL]], ptr [[PTR]], align 1
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1q_mf8(mfloat8_t *ptr, mfloat8x16_t val) {
-  vst1q_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7
-// CHECK-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst1_lane_mf8Pu6__mfp813__Mfloat8x8_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7
-// CHECK-CXX-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1_lane_mf8(mfloat8_t *ptr, mfloat8x8_t val) {
-  vst1_lane_mf8(ptr, val, 7);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15
-// CHECK-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst1q_lane_mf8Pu6__mfp814__Mfloat8x16_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15
-// CHECK-CXX-NEXT:    store i8 [[TMP0]], ptr [[PTR]], align 1
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1q_lane_mf8(mfloat8_t *ptr, mfloat8x16_t val) {
-  vst1q_lane_mf8(ptr, val, 15);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_mf8_x2(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x2Pu6__mfp813mfloat8x8x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1_mf8_x2(mfloat8_t *ptr, mfloat8x8x2_t val) {
-  vst1_mf8_x2(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x2(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x2Pu6__mfp814mfloat8x16x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1q_mf8_x2(mfloat8_t *ptr, mfloat8x16x2_t val) {
-  vst1q_mf8_x2(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_mf8_x3(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x3Pu6__mfp813mfloat8x8x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1_mf8_x3(mfloat8_t *ptr, mfloat8x8x3_t val) {
-  vst1_mf8_x3(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x3(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x3Pu6__mfp814mfloat8x16x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1q_mf8_x3(mfloat8_t *ptr, mfloat8x16x3_t val) {
-  vst1q_mf8_x3(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1_mf8_x4(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z16test_vst1_mf8_x4Pu6__mfp813mfloat8x8x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1_mf8_x4(mfloat8_t *ptr, mfloat8x8x4_t val) {
-  vst1_mf8_x4(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x4(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z17test_vst1q_mf8_x4Pu6__mfp814mfloat8x16x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst1q_mf8_x4(mfloat8_t *ptr, mfloat8x16x4_t val) {
-  vst1q_mf8_x4(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst2_mf8Pu6__mfp813mfloat8x8x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst2_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) {
-  vst2_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst2q_mf8Pu6__mfp814mfloat8x16x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst2q_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) {
-  vst2q_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst2_lane_mf8Pu6__mfp813mfloat8x8x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst2_lane_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) {
-  vst2_lane_mf8(ptr, val, 7);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst2q_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, 
ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst2q_lane_mf8Pu6__mfp814mfloat8x16x2_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, 
ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst2q_lane_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) {
-  vst2q_lane_mf8(ptr, val, 15);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst3_mf8Pu6__mfp813mfloat8x8x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst3_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) {
-  vst3_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst3q_mf8Pu6__mfp814mfloat8x16x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst3q_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) {
-  vst3q_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst3_lane_mf8Pu6__mfp813mfloat8x8x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst3_lane_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) {
-  vst3_lane_mf8(ptr, val, 7);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst3q_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst3q_lane_mf8Pu6__mfp814mfloat8x16x3_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst3q_lane_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) {
-  vst3q_lane_mf8(ptr, val, 15);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z13test_vst4_mf8Pu6__mfp813mfloat8x8x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst4_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) {
-  vst4_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z14test_vst4q_mf8Pu6__mfp814mfloat8x16x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst4q_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) {
-  vst4q_mf8(ptr, val);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr 
[[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z18test_vst4_lane_mf8Pu6__mfp813mfloat8x8x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr 
[[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst4_lane_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) {
-  vst4_lane_mf8(ptr, val, 7);
-}
-
-// CHECK-LABEL: define dso_local void @test_vst4q_lane_mf8(
-// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 0
-// CHECK-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 1
-// CHECK-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 2
-// CHECK-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[VAL_COERCE]], 3
-// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, 
ptr [[PTR]])
-// CHECK-NEXT:    ret void
-//
-// CHECK-CXX-LABEL: define dso_local void 
@_Z19test_vst4q_lane_mf8Pu6__mfp814mfloat8x16x4_t(
-// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) 
[[VAL_COERCE:%.*]]) #[[ATTR0]] {
-// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 0
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 1
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 2
-// CHECK-CXX-NEXT:    [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 
x i8>] [[VAL_COERCE]], 3
-// CHECK-CXX-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, 
ptr [[PTR]])
-// CHECK-CXX-NEXT:    ret void
-//
-void test_vst4q_lane_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) {
-  vst4q_lane_mf8(ptr, val, 15);
-}
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c 
b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 791f0a1a29409..24dc61180c68e 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -14732,6 +14732,16 @@ void test_vst1q_s64(int64_t *a, int64x2_t b) {
   vst1q_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    store <16 x i8> [[VAL]], ptr [[A]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_mf8(mfloat8_t *a, mfloat8x16_t val) {
+  vst1q_mf8(a, val);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1q_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -14885,6 +14895,16 @@ void test_vst1_s64(int64_t *a, int64x1_t b) {
   vst1_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    store <8 x i8> [[VAL]], ptr [[A]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1_mf8(mfloat8_t *a, mfloat8x8_t val) {
+  vst1_mf8(a, val);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15067,6 +15087,18 @@ void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
   vst2q_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst2q_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_mf8(mfloat8_t *a, mfloat8x16x2_t b) {
+  vst2q_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst2q_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15269,6 +15301,18 @@ void test_vst2_s64(int64_t *a, int64x1x2_t b) {
   vst2_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst2_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst2_mf8(mfloat8_t *a, mfloat8x8x2_t b) {
+  vst2_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst2_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15493,6 +15537,19 @@ void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
   vst3q_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst3q_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_mf8(mfloat8_t *a, mfloat8x16x3_t b) {
+  vst3q_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst3q_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15731,6 +15788,19 @@ void test_vst3_s64(int64_t *a, int64x1x3_t b) {
   vst3_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst3_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst3_mf8(mfloat8_t *a, mfloat8x8x3_t b) {
+  vst3_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst3_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -15992,6 +16062,20 @@ void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
   vst4q_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst4q_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_mf8(mfloat8_t *a, mfloat8x16x4_t b) {
+  vst4q_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst4q_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16266,6 +16350,20 @@ void test_vst4_s64(int64_t *a, int64x1x4_t b) {
   vst4_s64(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst4_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst4_mf8(mfloat8_t *a, mfloat8x8x4_t b) {
+  vst4_mf8(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst4_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16576,6 +16674,18 @@ poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
   return vld1_p64_x4(a);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_mf8_x2(mfloat8_t *a, mfloat8x16x2_t b) {
+  vst1q_mf8_x2(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1q_f64_x2(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16610,6 +16720,18 @@ void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
   vst1q_p64_x2(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1_mf8_x2(mfloat8_t *a, mfloat8x8x2_t b) {
+  vst1_mf8_x2(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1_f64_x2(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16646,6 +16768,19 @@ void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
   vst1_p64_x2(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_mf8_x3(mfloat8_t *a, mfloat8x16x3_t b) {
+  vst1q_mf8_x3(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1q_f64_x3(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16687,6 +16822,19 @@ void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
   vst1q_p64_x3(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1_mf8_x3(mfloat8_t *a, mfloat8x8x3_t b) {
+  vst1_mf8_x3(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1_f64_x3(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16731,6 +16879,20 @@ void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
   vst1_p64_x3(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_mf8_x4(mfloat8_t *a, mfloat8x16x4_t b) {
+  vst1q_mf8_x4(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1q_f64_x4(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -16779,6 +16941,20 @@ void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
   vst1q_p64_x4(a, b);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1_mf8_x4(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst1_mf8_x4(mfloat8_t *a, mfloat8x8x4_t b) {
+  vst1_mf8_x4(a, b);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1_f64_x4(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one.c 
b/clang/test/CodeGen/AArch64/neon-ldst-one.c
index 2cff007826ba6..486a3edbd82d9 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one.c
@@ -3151,6 +3151,17 @@ void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1q_lane_mf8(mfloat8_t *a, mfloat8x16_t b) {
+  vst1q_lane_mf8(a, b, 15);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1q_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -3330,6 +3341,17 @@ void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst1_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT:    ret void
+//
+void test_vst1_lane_mf8(mfloat8_t *a, mfloat8x8_t b) {
+  vst1_lane_mf8(a, b, 7);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst1_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -3530,6 +3552,18 @@ void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
   vst2q_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 15, ptr 
[[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_lane_mf8(mfloat8_t *a, mfloat8x16x2_t b) {
+  vst2q_lane_mf8(a, b, 15);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst2q_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -3748,6 +3782,18 @@ void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
   vst2_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst2_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr 
[[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst2_lane_mf8(mfloat8_t *a, mfloat8x8x2_t b) {
+  vst2_lane_mf8(a, b, 7);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst2_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -3988,6 +4034,19 @@ void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
   vst3q_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst3q_lane_mf8(mfloat8_t *a, mfloat8x16x3_t b) {
+  vst3q_lane_mf8(a, b, 15);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst3q_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -4245,6 +4304,19 @@ void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
   vst3_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst3_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst3_lane_mf8(mfloat8_t *a, mfloat8x8x3_t b) {
+  vst3_lane_mf8(a, b, 7);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst3_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -4525,6 +4597,20 @@ void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
   vst4q_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x 
i8>] [[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr 
[[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_lane_mf8(mfloat8_t *a, mfloat8x16x4_t b) {
+  vst4q_lane_mf8(a, b, 15);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst4q_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
@@ -4821,6 +4907,20 @@ void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
   vst4_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define dso_local void @test_vst4_lane_mf8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 0
+// CHECK-NEXT:    [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 1
+// CHECK-NEXT:    [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 2
+// CHECK-NEXT:    [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] 
[[B_COERCE]], 3
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr 
[[A]])
+// CHECK-NEXT:    ret void
+//
+void test_vst4_lane_mf8(mfloat8_t *a, mfloat8x8x4_t b) {
+  vst4_lane_mf8(a, b, 7);
+}
+
 // CHECK-LABEL: define dso_local void @test_vst4_lane_f16(
 // CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to