https://github.com/kmclaughlin-arm created https://github.com/llvm/llvm-project/pull/145346
Adds FP8 variants for existing VST1, VST2, VST3 & VST4 intrinsics. >From b2d9f70eb33ebbb26166bea4ba79f05204fc3cc2 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin <kerry.mclaugh...@arm.com> Date: Mon, 23 Jun 2025 13:07:34 +0000 Subject: [PATCH] [Clang][AArch64] Add FP8 variants of Neon store intrinsics Adds FP8 variants for existing VST1, VST2, VST3 & VST4 intrinsics. --- clang/include/clang/Basic/arm_neon.td | 22 +- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 22 + .../fp8-intrinsics/acle_neon_fp8_stores.c | 475 ++++++++++++++++++ 3 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 7251cc2d1759a..314330ed9fde6 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2119,6 +2119,26 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { } } +let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { + def VST1_MF8 : WInst<"vst1", "v*(.!)", "mQm">; + def VST2_MF8 : WInst<"vst2", "v*(2!)", "mQm">; + def VST3_MF8 : WInst<"vst3", "v*(3!)", "mQm">; + def VST4_MF8 : WInst<"vst4", "v*(4!)", "mQm">; + + def VST1_X2_MF8 : WInst<"vst1_x2", "v*(2!)", "mQm">; + def VST1_X3_MF8 : WInst<"vst1_x3", "v*(3!)", "mQm">; + def VST1_X4_MF8 : WInst<"vst1_x4", "v*(4!)", "mQm">; + + def VST1_LANE_MF8 : WInst<"vst1_lane", "v*(.!)I", "mQm", + [ImmCheck<2, ImmCheckLaneIndex, 1>]>; + def VST2_LANE_MF8 : WInst<"vst2_lane", "v*(2!)I", "mQm", + [ImmCheck<3, ImmCheckLaneIndex, 1>]>; + def VST3_LANE_MF8 : WInst<"vst3_lane", "v*(3!)I", "mQm", + [ImmCheck<4, ImmCheckLaneIndex, 1>]>; + def VST4_LANE_MF8 : WInst<"vst4_lane", "v*(4!)I", "mQm", + [ImmCheck<5, ImmCheckLaneIndex, 1>]>; +} + let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">; def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Hm">; @@ -2194,4 +2214,4 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { // fscale def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">; def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">; -} \ No newline at end of file +} diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 6738d4be6dd21..3bd5054050036 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -1553,6 +1553,28 @@ static const std::pair<unsigned, unsigned> NEONEquivalentIntrinsicMap[] = { { NEON::BI__builtin_neon_vst4_lane_bf16, NEON::BI__builtin_neon_vst4_lane_v }, { NEON::BI__builtin_neon_vst4q_bf16, NEON::BI__builtin_neon_vst4q_v }, { NEON::BI__builtin_neon_vst4q_lane_bf16, NEON::BI__builtin_neon_vst4q_lane_v }, + { NEON::BI__builtin_neon_vst1_mf8_x2, NEON::BI__builtin_neon_vst1_x2_v }, + { NEON::BI__builtin_neon_vst1_mf8_x3, NEON::BI__builtin_neon_vst1_x3_v }, + { NEON::BI__builtin_neon_vst1_mf8_x4, NEON::BI__builtin_neon_vst1_x4_v }, + { NEON::BI__builtin_neon_vst1_mf8, NEON::BI__builtin_neon_vst1_v }, + { NEON::BI__builtin_neon_vst1_lane_mf8, NEON::BI__builtin_neon_vst1_lane_v }, + { NEON::BI__builtin_neon_vst1q_mf8_x2, NEON::BI__builtin_neon_vst1q_x2_v }, + { NEON::BI__builtin_neon_vst1q_mf8_x3, NEON::BI__builtin_neon_vst1q_x3_v }, + { NEON::BI__builtin_neon_vst1q_mf8_x4, NEON::BI__builtin_neon_vst1q_x4_v }, + { NEON::BI__builtin_neon_vst1q_mf8, NEON::BI__builtin_neon_vst1q_v }, + { NEON::BI__builtin_neon_vst1q_lane_mf8, NEON::BI__builtin_neon_vst1q_lane_v }, + { NEON::BI__builtin_neon_vst2_mf8, NEON::BI__builtin_neon_vst2_v }, + { NEON::BI__builtin_neon_vst2_lane_mf8, NEON::BI__builtin_neon_vst2_lane_v }, + { NEON::BI__builtin_neon_vst2q_mf8, NEON::BI__builtin_neon_vst2q_v }, + { NEON::BI__builtin_neon_vst2q_lane_mf8, NEON::BI__builtin_neon_vst2q_lane_v }, + { NEON::BI__builtin_neon_vst3_mf8, NEON::BI__builtin_neon_vst3_v }, + { NEON::BI__builtin_neon_vst3_lane_mf8, NEON::BI__builtin_neon_vst3_lane_v }, + { NEON::BI__builtin_neon_vst3q_mf8, NEON::BI__builtin_neon_vst3q_v }, + { NEON::BI__builtin_neon_vst3q_lane_mf8, NEON::BI__builtin_neon_vst3q_lane_v }, + { NEON::BI__builtin_neon_vst4_mf8, NEON::BI__builtin_neon_vst4_v }, + { NEON::BI__builtin_neon_vst4_lane_mf8, NEON::BI__builtin_neon_vst4_lane_v }, + { NEON::BI__builtin_neon_vst4q_mf8, NEON::BI__builtin_neon_vst4q_v }, + { NEON::BI__builtin_neon_vst4q_lane_mf8, NEON::BI__builtin_neon_vst4q_lane_v }, // The mangling rules cause us to have one ID for each type for vldap1(q)_lane // and vstl1(q)_lane, but codegen is equivalent for all of them. Choose an // arbitrary one to be handled as tha canonical variation. diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c new file mode 100644 index 0000000000000..f09bacdbe6302 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_stores.c @@ -0,0 +1,475 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 \ +// RUN: -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 \ +// RUN: -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +fp8 -O3 -Werror -Wall -S -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include "arm_neon.h" + +// CHECK-LABEL: define dso_local void @test_vst1_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: store <8 x i8> [[VAL]], ptr [[PTR]], align 1 +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z13test_vst1_mf8Pu6__mfp813__Mfloat8x8_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: store <8 x i8> [[VAL]], ptr [[PTR]], align 1 +// CHECK-CXX-NEXT: ret void +// +void test_vst1_mf8(mfloat8_t *ptr, mfloat8x8_t val) { + vst1_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1q_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: store <16 x i8> [[VAL]], ptr [[PTR]], align 1 +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_vst1q_mf8Pu6__mfp814__Mfloat8x16_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: store <16 x i8> [[VAL]], ptr [[PTR]], align 1 +// CHECK-CXX-NEXT: ret void +// +void test_vst1q_mf8(mfloat8_t *ptr, mfloat8x16_t val) { + vst1q_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[PTR]], align 1 +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z18test_vst1_lane_mf8Pu6__mfp813__Mfloat8x8_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <8 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[VAL]], i32 7 +// CHECK-CXX-NEXT: store i8 [[TMP0]], ptr [[PTR]], align 1 +// CHECK-CXX-NEXT: ret void +// +void test_vst1_lane_mf8(mfloat8_t *ptr, mfloat8x8_t val) { + vst1_lane_mf8(ptr, val, 7); +} + +// CHECK-LABEL: define dso_local void @test_vst1q_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[PTR]], align 1 +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z19test_vst1q_lane_mf8Pu6__mfp814__Mfloat8x16_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], <16 x i8> [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[VAL]], i32 15 +// CHECK-CXX-NEXT: store i8 [[TMP0]], ptr [[PTR]], align 1 +// CHECK-CXX-NEXT: ret void +// +void test_vst1q_lane_mf8(mfloat8_t *ptr, mfloat8x16_t val) { + vst1q_lane_mf8(ptr, val, 15); +} + +// CHECK-LABEL: define dso_local void @test_vst1_mf8_x2( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z16test_vst1_mf8_x2Pu6__mfp813mfloat8x8x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1_mf8_x2(mfloat8_t *ptr, mfloat8x8x2_t val) { + vst1_mf8_x2(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x2( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z17test_vst1q_mf8_x2Pu6__mfp814mfloat8x16x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1q_mf8_x2(mfloat8_t *ptr, mfloat8x16x2_t val) { + vst1q_mf8_x2(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1_mf8_x3( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z16test_vst1_mf8_x3Pu6__mfp813mfloat8x8x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1_mf8_x3(mfloat8_t *ptr, mfloat8x8x3_t val) { + vst1_mf8_x3(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x3( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z17test_vst1q_mf8_x3Pu6__mfp814mfloat8x16x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1q_mf8_x3(mfloat8_t *ptr, mfloat8x16x3_t val) { + vst1q_mf8_x3(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1_mf8_x4( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z16test_vst1_mf8_x4Pu6__mfp813mfloat8x8x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1_mf8_x4(mfloat8_t *ptr, mfloat8x8x4_t val) { + vst1_mf8_x4(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst1q_mf8_x4( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z17test_vst1q_mf8_x4Pu6__mfp814mfloat8x16x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst1q_mf8_x4(mfloat8_t *ptr, mfloat8x16x4_t val) { + vst1q_mf8_x4(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst2_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z13test_vst2_mf8Pu6__mfp813mfloat8x8x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst2_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) { + vst2_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst2q_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_vst2q_mf8Pu6__mfp814mfloat8x16x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst2q_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) { + vst2q_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst2_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z18test_vst2_lane_mf8Pu6__mfp813mfloat8x8x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst2_lane_mf8(mfloat8_t *ptr, mfloat8x8x2_t val) { + vst2_lane_mf8(ptr, val, 7); +} + +// CHECK-LABEL: define dso_local void @test_vst2q_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z19test_vst2q_lane_mf8Pu6__mfp814mfloat8x16x2_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst2q_lane_mf8(mfloat8_t *ptr, mfloat8x16x2_t val) { + vst2q_lane_mf8(ptr, val, 15); +} + +// CHECK-LABEL: define dso_local void @test_vst3_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z13test_vst3_mf8Pu6__mfp813mfloat8x8x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst3_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) { + vst3_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst3q_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_vst3q_mf8Pu6__mfp814mfloat8x16x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst3q_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) { + vst3q_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst3_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z18test_vst3_lane_mf8Pu6__mfp813mfloat8x8x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst3_lane_mf8(mfloat8_t *ptr, mfloat8x8x3_t val) { + vst3_lane_mf8(ptr, val, 7); +} + +// CHECK-LABEL: define dso_local void @test_vst3q_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z19test_vst3q_lane_mf8Pu6__mfp814mfloat8x16x3_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst3q_lane_mf8(mfloat8_t *ptr, mfloat8x16x3_t val) { + vst3q_lane_mf8(ptr, val, 15); +} + +// CHECK-LABEL: define dso_local void @test_vst4_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z13test_vst4_mf8Pu6__mfp813mfloat8x8x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst4_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) { + vst4_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst4q_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z14test_vst4q_mf8Pu6__mfp814mfloat8x16x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst4q_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) { + vst4q_mf8(ptr, val); +} + +// CHECK-LABEL: define dso_local void @test_vst4_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z18test_vst4_lane_mf8Pu6__mfp813mfloat8x8x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <8 x i8>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst4_lane_mf8(mfloat8_t *ptr, mfloat8x8x4_t val) { + vst4_lane_mf8(ptr, val, 7); +} + +// CHECK-LABEL: define dso_local void @test_vst4q_lane_mf8( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-NEXT: ret void +// +// CHECK-CXX-LABEL: define dso_local void @_Z19test_vst4q_lane_mf8Pu6__mfp814mfloat8x16x4_t( +// CHECK-CXX-SAME: ptr noundef [[PTR:%.*]], [4 x <16 x i8>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 0 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 1 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 2 +// CHECK-CXX-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[VAL_COERCE]], 3 +// CHECK-CXX-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[VAL_COERCE_FCA_0_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_1_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_2_EXTRACT]], <16 x i8> [[VAL_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[PTR]]) +// CHECK-CXX-NEXT: ret void +// +void test_vst4q_lane_mf8(mfloat8_t *ptr, mfloat8x16x4_t val) { + vst4q_lane_mf8(ptr, val, 15); +} _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits