https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/68565
>From de07976922782b9dcf5d13d44551b782dc8b3b94 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs <samuel.te...@arm.com> Date: Fri, 6 Oct 2023 17:09:36 +0100 Subject: [PATCH 1/6] [AArch64][SME] Remove immediate argument restriction for svldr and svstr The svldr_vnum_za and svstr_vnum_za builtins/intrinsics currently require that the vnum argument be an immediate, since the instructions take an immediate vector number. However, we emit 0 as the immediate for the instruction no matter what, and instead modify the base register. This patch removes that restriction on the argument, so that the argument can be a non-immediate. If an appropriate immediate was passed to the builtin then CGBuiltin passes that directly to the LLVM intrinsic, otherwise it modifies the base register, as is the existing behaviour. --- clang/lib/CodeGen/CGBuiltin.cpp | 45 ++++++++---- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 71 ++++++++----------- .../aarch64-sme-intrinsics/acle_sme_str.c | 51 ++++--------- llvm/include/llvm/IR/IntrinsicsAArch64.td | 4 +- llvm/lib/Target/AArch64/SMEInstrFormats.td | 10 +-- .../CostModel/ARM/unaligned_double_load.ll | 59 +++++++++++++++ .../CodeGen/AArch64/sme-intrinsics-loads.ll | 33 +++++++-- 7 files changed, 166 insertions(+), 107 deletions(-) create mode 100644 llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e1211bb8949b665..63508b40096141e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9712,6 +9712,11 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, return Store; } +Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { + llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int64Ty, false); + return Builder.CreateAdd(Base, CastOffset, "tileslice"); +} + Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { @@ -9767,18 +9772,34 @@ Value 
*CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags, Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { - if (Ops.size() == 3) { - Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); - llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); - - llvm::Value *VecNum = Ops[2]; - llvm::Value *MulVL = Builder.CreateMul(CntsbCall, VecNum, "mulvl"); - - Ops[1] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); - Ops[0] = Builder.CreateAdd( - Ops[0], Builder.CreateIntCast(VecNum, Int32Ty, true), "tileslice"); - Ops.erase(&Ops[2]); - } + if (Ops.size() == 2) { + // Intrinsics without a vecnum also use this function, so just provide 0 + Ops.push_back(Ops[1]); + Ops[1] = Builder.getInt32(0); + } else { + int Imm = -1; + if (ConstantInt* C = dyn_cast<ConstantInt>(Ops[2])) + if (C->getZExtValue() <= 15) + Imm = C->getZExtValue(); + + if (Imm != -1) { + Ops[2] = Ops[1]; + Ops[1] = Builder.getInt32(Imm); + } else { + Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); + llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); + + llvm::Value *VecNum = Ops[2]; + llvm::Value *MulVL = Builder.CreateMul( + CntsbCall, + VecNum, + "mulvl"); + + Ops[2] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); + Ops[1] = Builder.getInt32(0); + Ops[0] = Builder.CreateIntCast(EmitTileslice(Ops[0], VecNum), Int32Ty, false); + } + } Function *F = CGM.getIntrinsic(IntID, {}); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index e85c47072f2df80..8e07cf1d11c19b2 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -6,57 +6,46 @@ #include <arm_sme_draft_spec_subject_to_change.h> -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef 
[[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z18test_svldr_vnum_zajPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za( +// CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { svldr_vnum_za(slice_base, ptr, 0); } -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_1( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z20test_svldr_vnum_za_1jPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-CXX-NEXT: tail call void 
@llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za_1( +// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { svldr_vnum_za(slice_base, ptr, 15); } -// CHECK-C-LABEL: define dso_local void @test_svldr_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvm( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() +// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP2]], i32 0, ptr [[TMP0]]) +// CHECK-NEXT: ret void // -// CHECK-CXX-LABEL: define dso_local void @_Z13test_svldr_zajPKv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, uint64_t vnum) { + svldr_vnum_za(slice_base, ptr, vnum); +} + +// CHECK-C-LABEL: @test_svldr_za( +// CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svldr_za(uint32_t 
slice_base, const void *ptr) { svldr_za(slice_base, ptr); @@ -87,5 +76,3 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) { void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) { svldr_vnum_za(slice_base, ptr, vnum); } -//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -// CHECK: {{.*}} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index e53a3c6c57de323..532f570b6aaa444 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -6,57 +6,32 @@ #include <arm_sme_draft_spec_subject_to_change.h> -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z18test_svstr_vnum_zajPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_vnum_za( +// CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 0); } // CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// 
CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// // CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], 15 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[SLICE_BASE]], 15 -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 15); } // CHECK-C-LABEL: define dso_local void @test_svstr_za( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-C-NEXT: ret void -// // CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], ptr [[PTR]]) -// CHECK-CXX-NEXT: ret void +// CHECK-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, 
ptr [[PTR]]) +// CHECK-NEXT: ret void // void test_svstr_za(uint32_t slice_base, void *ptr) { svstr_za(slice_base, ptr); @@ -87,5 +62,3 @@ void test_svstr_za(uint32_t slice_base, void *ptr) { void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { svstr_vnum_za(slice_base, ptr, vnum); } -//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -// CHECK: {{.*}} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index a42e2c49cb477ba..36b552c123dd1ab 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2680,9 +2680,9 @@ let TargetPrefix = "aarch64" in { // Spill + fill def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>; def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_ptr_ty]>; + [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 823115c7d025005..90b269574adebde 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -794,8 +794,8 @@ multiclass sme_spill<string opcodestr> { (!cast<Instruction>(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>; + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base), + (!cast<Instruction>(NAME) ZA, $idx, $imm, $base, 0)>; } multiclass sme_fill<string opcodestr> { @@ -805,7 +805,7 @@ multiclass sme_fill<string opcodestr> { MatrixIndexGPR32Op12_15:$Rv, 
sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; def NAME # _PSEUDO : Pseudo<(outs), - (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4, + (ins MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm4, GPR64sp:$base), []>, Sched<[]> { // Translated to actual instruction in AArch64ISelLowering.cpp @@ -813,8 +813,8 @@ multiclass sme_fill<string opcodestr> { let mayLoad = 1; } // base - def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base), - (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>; + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base), + (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm, $base)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll new file mode 100644 index 000000000000000..8d457220ea9c5ae --- /dev/null +++ b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP + +define float @f(ptr %x) { +; CHECK-NOVEC-LABEL: 'f' +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load float, ptr %x, align 1 +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload +; +; CHECK-FP-LABEL: 'f' +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load float, ptr %x, align 1 +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload +; +entry: + %a.0.copyload = load float, ptr %x, align 1 + ret 
float %a.0.copyload +} + +define float @ff(ptr %x, float %f) { +; CHECK-NOVEC-LABEL: 'ff' +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %f, ptr %x, align 1 +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef +; +; CHECK-FP-LABEL: 'ff' +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store float %f, ptr %x, align 1 +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef +; +entry: + store float %f, ptr %x, align 1 + ret float undef +} + +define double @d(ptr %x) { +; CHECK-NOVEC-LABEL: 'd' +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load double, ptr %x, align 1 +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload +; +; CHECK-FP-LABEL: 'd' +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load double, ptr %x, align 1 +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload +; +entry: + %a.0.copyload = load double, ptr %x, align 1 + ret double %a.0.copyload +} + +define double @dd(ptr %x, double %f) { +; CHECK-NOVEC-LABEL: 'dd' +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double %f, ptr %x, align 1 +; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef +; +; CHECK-FP-LABEL: 'dd' +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double %f, ptr %x, align 1 +; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef +; +entry: + store double %f, ptr %x, align 1 + ret double undef +} diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index c96aca366ed43f2..f5d25a3229a7f82 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll 
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -252,10 +252,28 @@ define void @ldr(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: ldr za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.ldr(i32 0, i32 0, ptr %ptr) ret void; } +define void @ldr_vnum(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_vnum: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add w12, w2, w0 +; CHECK-NEXT: madd x8, x8, x2, x1 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret +entry: + %svlb = tail call i64 @llvm.aarch64.sme.cntsb() + %mulvl = mul i64 %svlb, %vnum + %0 = getelementptr i8, ptr %ptr, i64 %mulvl + %1 = trunc i64 %vnum to i32 + %2 = add i32 %1, %tile_slice + tail call void @llvm.aarch64.sme.ldr(i32 %2, i32 0, ptr %0) + ret void +} + define void @ldr_with_off_15(ptr %ptr) { ; CHECK-LABEL: ldr_with_off_15: ; CHECK: // %bb.0: @@ -264,7 +282,7 @@ define void @ldr_with_off_15(ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base) ret void; } @@ -278,7 +296,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base) ret void; } @@ -292,7 +310,7 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 16, i32 0, ptr %base) ret void; } @@ -302,13 +320,13 @@ define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: 
// %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB14_1: // %for.body +; CHECK-NEXT: .LBB15_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0] -; CHECK-NEXT: b.ne .LBB14_1 +; CHECK-NEXT: b.ne .LBB15_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -341,5 +359,6 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32) -declare void @llvm.aarch64.sme.ldr(i32, ptr) +declare void @llvm.aarch64.sme.ldr(i32, i32, ptr) declare i64 @llvm.vscale.i64() +declare i64 @llvm.aarch64.sme.cntsb() >From c44d2a76055548c6eaf020b2b1008e528de11535 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs <samuel.te...@arm.com> Date: Mon, 9 Oct 2023 09:52:28 +0100 Subject: [PATCH 2/6] fixup: remove erroneously included file --- .../CostModel/ARM/unaligned_double_load.ll | 59 ------------------- 1 file changed, 59 deletions(-) delete mode 100644 llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll diff --git a/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll b/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll deleted file mode 100644 index 8d457220ea9c5ae..000000000000000 --- a/llvm/test/Analysis/CostModel/ARM/unaligned_double_load.ll +++ /dev/null @@ -1,59 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv6m-none-eabi < %s | FileCheck %s --check-prefix=CHECK-NOVEC -; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv7m-none-eabi -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-FP - -define float @f(ptr %x) { -; CHECK-NOVEC-LABEL: 'f' -; 
CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load float, ptr %x, align 1 -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload -; -; CHECK-FP-LABEL: 'f' -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load float, ptr %x, align 1 -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float %a.0.copyload -; -entry: - %a.0.copyload = load float, ptr %x, align 1 - ret float %a.0.copyload -} - -define float @ff(ptr %x, float %f) { -; CHECK-NOVEC-LABEL: 'ff' -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %f, ptr %x, align 1 -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef -; -; CHECK-FP-LABEL: 'ff' -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store float %f, ptr %x, align 1 -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef -; -entry: - store float %f, ptr %x, align 1 - ret float undef -} - -define double @d(ptr %x) { -; CHECK-NOVEC-LABEL: 'd' -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a.0.copyload = load double, ptr %x, align 1 -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload -; -; CHECK-FP-LABEL: 'd' -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a.0.copyload = load double, ptr %x, align 1 -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double %a.0.copyload -; -entry: - %a.0.copyload = load double, ptr %x, align 1 - ret double %a.0.copyload -} - -define double @dd(ptr %x, double %f) { -; CHECK-NOVEC-LABEL: 'dd' -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store double %f, ptr %x, align 1 -; CHECK-NOVEC-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret double undef -; -; CHECK-FP-LABEL: 'dd' -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double %f, ptr %x, align 1 -; CHECK-FP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret double undef -; -entry: - store double %f, ptr %x, align 1 - ret double undef -} >From 91ddfb768b545934d819fc400a5df1b9208d8b60 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs <samuel.te...@arm.com> Date: Thu, 19 Oct 2023 11:32:30 +0100 Subject: [PATCH 3/6] fixup! Use DAGToDAG approach --- clang/lib/CodeGen/CGBuiltin.cpp | 37 ++-------- .../aarch64-sme-intrinsics/acle_sme_ldr.c | 58 ++++++---------- .../aarch64-sme-intrinsics/acle_sme_str.c | 52 +++++++------- llvm/include/llvm/IR/IntrinsicsAArch64.td | 4 +- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 54 +++++++++++++++ llvm/lib/Target/AArch64/SMEInstrFormats.td | 4 +- .../CodeGen/AArch64/sme-intrinsics-loads.ll | 68 ++++++++++++------- .../CodeGen/AArch64/sme-intrinsics-stores.ll | 54 +++++++++++++-- 8 files changed, 193 insertions(+), 138 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 63508b40096141e..65c6530bcbcf243 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -9712,11 +9712,6 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, return Store; } -Value *CodeGenFunction::EmitTileslice(Value *Offset, Value *Base) { - llvm::Value *CastOffset = Builder.CreateIntCast(Offset, Int64Ty, false); - return Builder.CreateAdd(Base, CastOffset, "tileslice"); -} - Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { @@ -9772,34 +9767,10 @@ Value *CodeGenFunction::EmitSMEZero(const SVETypeFlags &TypeFlags, Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { - if (Ops.size() == 2) { - // Intrinsics without a vecnum also use this function, so just provide 0 - 
Ops.push_back(Ops[1]); - Ops[1] = Builder.getInt32(0); - } else { - int Imm = -1; - if (ConstantInt* C = dyn_cast<ConstantInt>(Ops[2])) - if (C->getZExtValue() <= 15) - Imm = C->getZExtValue(); - - if (Imm != -1) { - Ops[2] = Ops[1]; - Ops[1] = Builder.getInt32(Imm); - } else { - Function *Cntsb = CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); - llvm::Value *CntsbCall = Builder.CreateCall(Cntsb, {}, "svlb"); - - llvm::Value *VecNum = Ops[2]; - llvm::Value *MulVL = Builder.CreateMul( - CntsbCall, - VecNum, - "mulvl"); - - Ops[2] = Builder.CreateGEP(Int8Ty, Ops[1], MulVL); - Ops[1] = Builder.getInt32(0); - Ops[0] = Builder.CreateIntCast(EmitTileslice(Ops[0], VecNum), Int32Ty, false); - } - } + if (Ops.size() == 2) + Ops.push_back(Builder.getInt32(0)); + else + Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true); Function *F = CGM.getIntrinsic(IntID, {}); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 8e07cf1d11c19b2..9af0778e89c5ec0 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c @@ -9,7 +9,7 @@ // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { @@ -19,60 +19,40 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) { // CHECK-C-LABEL: @test_svldr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr 
[[PTR:%.*]], i32 15) // CHECK-NEXT: ret void // void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) { svldr_vnum_za(slice_base, ptr, 15); } -// CHECK-C-LABEL: @test_svldr_vnum_za_var( -// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvm( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TMP2]], i32 0, ptr [[TMP0]]) -// CHECK-NEXT: ret void -// -void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, uint64_t vnum) { - svldr_vnum_za(slice_base, ptr, vnum); -} - // CHECK-C-LABEL: @test_svldr_za( // CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svldr_za(uint32_t slice_base, const void *ptr) { svldr_za(slice_base, ptr); } -// CHECK-C-LABEL: define dso_local void @test_svldr_vnum_za_var( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-C-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void 
@_Z22test_svldr_vnum_za_varjPKvl( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svldr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: ret void // void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) { svldr_vnum_za(slice_base, ptr, vnum); } + +// CHECK-C-LABEL: @test_svldr_vnum_za_2( +// CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: ret void +// +void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) { + svldr_vnum_za(slice_base, ptr, 16); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c index 532f570b6aaa444..baadfc18563a005 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c @@ -9,56 +9,50 @@ // CHECK-C-LABEL: @test_svstr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// 
CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svstr_vnum_za(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 0); } -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_1( -// CHECK-CXX-LABEL: define dso_local void @_Z20test_svstr_vnum_za_1jPv( +// CHECK-C-LABEL: @test_svstr_vnum_za_1( +// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 15, ptr [[PTR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) // CHECK-NEXT: ret void // void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) { svstr_vnum_za(slice_base, ptr, 15); } -// CHECK-C-LABEL: define dso_local void @test_svstr_za( -// CHECK-CXX-LABEL: define dso_local void @_Z13test_svstr_zajPv( -// CHECK-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-C-LABEL: @test_svstr_za( +// CHECK-CXX-LABEL: @_Z13test_svstr_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE]], i32 0, ptr [[PTR]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) // CHECK-NEXT: ret void // void test_svstr_za(uint32_t slice_base, void *ptr) { svstr_za(slice_base, ptr); } -// CHECK-C-LABEL: define dso_local void @test_svstr_vnum_za_var( -// CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-C-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-C-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-C-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// 
CHECK-C-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-C-NEXT: ret void -// -// CHECK-CXX-LABEL: define dso_local void @_Z22test_svstr_vnum_za_varjPvl( -// CHECK-CXX-SAME: i32 noundef [[SLICE_BASE:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[SVLB:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: [[MULVL:%.*]] = mul i64 [[SVLB]], [[VNUM]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[MULVL]] -// CHECK-CXX-NEXT: [[TMP1:%.*]] = trunc i64 [[VNUM]] to i32 -// CHECK-CXX-NEXT: [[TILESLICE:%.*]] = add i32 [[TMP1]], [[SLICE_BASE]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[TILESLICE]], ptr [[TMP0]]) -// CHECK-CXX-NEXT: ret void +// CHECK-C-LABEL: @test_svstr_vnum_za_var( +// CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: ret void // void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) { svstr_vnum_za(slice_base, ptr, vnum); } + +// CHECK-C-LABEL: @test_svstr_vnum_za_2( +// CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: ret void +// +void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) { + svstr_vnum_za(slice_base, ptr, 16); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 36b552c123dd1ab..222d04de876bbf8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2680,9 +2680,9 @@ let TargetPrefix = "aarch64" in { // Spill + fill def int_aarch64_sme_ldr : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, 
llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>; + [], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>; def int_aarch64_sme_str : DefaultAttrsIntrinsic< - [], [llvm_i32_ty, llvm_i32_ty, llvm_ptr_ty], [ImmArg<ArgIndex<1>>]>; + [], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>; class SME_TileToVector_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty], diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 38759a2474518fc..473e34b14e383af 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -379,6 +379,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPExtPair(SDNode *N, unsigned Opc); void SelectWhilePair(SDNode *N, unsigned Opc); void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode); + void SelectSMELdrStrZA(SDNode *N, bool IsLoad); void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, unsigned Opc); @@ -1741,6 +1742,54 @@ void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } +void AArch64DAGToDAGISel::SelectSMELdrStrZA(SDNode *N, bool IsLoad) { + // Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA. + // If the vector select parameter is an immediate in the range 0-15 then we + // can emit it directly into the instruction as it's a legal operand. + // Otherwise we must emit 0 as the vector select operand and modify the base + // register instead. 
+ SDLoc DL(N); + + SDValue VecNum = N->getOperand(4), Base = N->getOperand(3), + TileSlice = N->getOperand(2); + int Imm = -1; + if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) + Imm = ImmNode->getZExtValue(); + + if (Imm >= 0 && Imm <= 15) { + // 0-15 is a legal immediate so just pass it directly as a TargetConstant + VecNum = CurDAG->getTargetConstant(Imm, DL, MVT::i32); + } else { + // Get the vector length that will be multiplied by vnum + auto SVL = SDValue( + CurDAG->getMachineNode(AArch64::RDSVLI_XI, DL, MVT::i64, + CurDAG->getTargetConstant(1, DL, MVT::i32)), + 0); + + // Multiply SVL and vnum then add it to the base register + if (VecNum.getValueType() == MVT::i32) + VecNum = Widen(CurDAG, VecNum); + SDValue AddOps[] = {SVL, VecNum, Base}; + auto Add = SDValue( + CurDAG->getMachineNode(AArch64::MADDXrrr, DL, MVT::i64, AddOps), 0); + + // The base register has been modified to take vnum into account so just + // pass 0 + VecNum = CurDAG->getTargetConstant(0, DL, MVT::i32); + Base = Add; + } + + SmallVector<SDValue, 6> Ops = {TileSlice, VecNum, Base}; + if (!IsLoad) { + Ops.insert(Ops.begin(), CurDAG->getRegister(AArch64::ZA, MVT::Other)); + Ops.push_back(VecNum); + } + auto LdrStr = + CurDAG->getMachineNode(IsLoad ? 
AArch64::LDR_ZA_PSEUDO : AArch64::STR_ZA, + DL, N->getValueType(0), Ops); + ReplaceNode(N, LdrStr); +} + void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, @@ -5663,6 +5712,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: break; + case Intrinsic::aarch64_sme_str: + case Intrinsic::aarch64_sme_ldr: { + SelectSMELdrStrZA(Node, IntNo == Intrinsic::aarch64_sme_ldr); + return; + } case Intrinsic::aarch64_neon_st1x2: { if (VT == MVT::v8i8) { SelectStore(Node, 2, AArch64::ST1Twov8b); diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 90b269574adebde..7f96929e4d47376 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -794,7 +794,7 @@ multiclass sme_spill<string opcodestr> { (!cast<Instruction>(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base), + def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base, sme_elm_idx0_15:$imm), (!cast<Instruction>(NAME) ZA, $idx, $imm, $base, 0)>; } @@ -813,7 +813,7 @@ multiclass sme_fill<string opcodestr> { let mayLoad = 1; } // base - def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm, GPR64sp:$base), + def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base, sme_elm_idx0_15:$imm), (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm, $base)>; } diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index f5d25a3229a7f82..340b54cc0d2731f 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -252,28 +252,10 @@ define void @ldr(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: ldr za[w12, 0], [x0] ; CHECK-NEXT: 
ret - call void @llvm.aarch64.sme.ldr(i32 0, i32 0, ptr %ptr) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0) ret void; } -define void @ldr_vnum(i32 %tile_slice, ptr %ptr, i64 %vnum) { -; CHECK-LABEL: ldr_vnum: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: add w12, w2, w0 -; CHECK-NEXT: madd x8, x8, x2, x1 -; CHECK-NEXT: ldr za[w12, 0], [x8] -; CHECK-NEXT: ret -entry: - %svlb = tail call i64 @llvm.aarch64.sme.cntsb() - %mulvl = mul i64 %svlb, %vnum - %0 = getelementptr i8, ptr %ptr, i64 %mulvl - %1 = trunc i64 %vnum to i32 - %2 = add i32 %1, %tile_slice - tail call void @llvm.aarch64.sme.ldr(i32 %2, i32 0, ptr %0) - ret void -} - define void @ldr_with_off_15(ptr %ptr) { ; CHECK-LABEL: ldr_with_off_15: ; CHECK: // %bb.0: @@ -282,7 +264,7 @@ define void @ldr_with_off_15(ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -296,7 +278,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 15, i32 0, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) ret void; } @@ -310,7 +292,42 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 16, i32 0, ptr %base) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0) + ret void; +} + +define void @ldr_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: ldr_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: madd x8, x8, x1, x0 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: 
ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @ldr_with_off_15imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15) + ret void; +} + +define void @ldr_with_off_16imm(ptr %base) { +; CHECK-LABEL: ldr_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: madd x8, x8, x12, x0 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) ret void; } @@ -320,13 +337,13 @@ define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB15_1: // %for.body +; CHECK-NEXT: .LBB17_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0] -; CHECK-NEXT: b.ne .LBB15_1 +; CHECK-NEXT: b.ne .LBB17_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -359,6 +376,5 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32) -declare void @llvm.aarch64.sme.ldr(i32, i32, ptr) +declare void @llvm.aarch64.sme.ldr(i32, ptr, i32) declare i64 @llvm.vscale.i64() -declare i64 @llvm.aarch64.sme.cntsb() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index 2bb9c3d05b9da5c..b55c2bc78b0fcf0 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -252,7 +252,7 @@ define void @str(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: str za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.str(i32 0, ptr %ptr) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0) ret void; } @@ -264,7 +264,7 @@ define void @str_with_off_15(ptr %ptr) { ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -278,7 +278,7 @@ define void @str_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 15, ptr %base) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) ret void; } @@ -292,7 +292,47 @@ define void @str_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 16, ptr %base) + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 0) + ret void; +} + +define void @str_with_off_var(ptr %base, i32 %off) { +; CHECK-LABEL: str_with_off_var: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: mov w12, #16 // =0x10 +; CHECK-NEXT: madd x8, x8, x1, x0 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) + ret void; +} + +define void @str_with_off_15imm(ptr %ptr) { +; CHECK-LABEL: str_with_off_15imm: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, #15 // =0xf +; CHECK-NEXT: add x8, x0, #15 +; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15) + ret void; +} + +define void @str_with_off_16imm(ptr %ptr) 
{ +; CHECK-LABEL: str_with_off_16imm: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: add x10, x0, #15 +; CHECK-NEXT: madd x8, x8, x9, x10 +; CHECK-NEXT: mov w12, #15 // =0xf +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 15 + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) ret void; } @@ -302,13 +342,13 @@ define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 ; CHECK-LABEL: test_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB14_1: // %for.body +; CHECK-NEXT: .LBB17_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0] ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0] -; CHECK-NEXT: b.ne .LBB14_1 +; CHECK-NEXT: b.ne .LBB17_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: @@ -340,5 +380,5 @@ declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, ptr, i32, i32) -declare void @llvm.aarch64.sme.str(i32, ptr) +declare void @llvm.aarch64.sme.str(i32, ptr, i32) declare i64 @llvm.vscale.i64() >From 2f57f925e81b7e5171eb8d283e3450823d6acbf1 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs <samuel.te...@arm.com> Date: Fri, 3 Nov 2023 09:47:50 +0000 Subject: [PATCH 4/6] fixup! 
lower in ISelLowering instead --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 54 -------------- .../Target/AArch64/AArch64ISelLowering.cpp | 72 +++++++++++++++++++ llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 ++ llvm/lib/Target/AArch64/SMEInstrFormats.td | 21 +++--- .../CodeGen/AArch64/sme-intrinsics-loads.ll | 61 +++++++++++++--- .../CodeGen/AArch64/sme-intrinsics-stores.ll | 63 +++++++++++++--- 6 files changed, 193 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 473e34b14e383af..38759a2474518fc 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -379,7 +379,6 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { void SelectPExtPair(SDNode *N, unsigned Opc); void SelectWhilePair(SDNode *N, unsigned Opc); void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode); - void SelectSMELdrStrZA(SDNode *N, bool IsLoad); void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode); void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs, bool IsTupleInput, unsigned Opc); @@ -1742,54 +1741,6 @@ void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, CurDAG->RemoveDeadNode(N); } -void AArch64DAGToDAGISel::SelectSMELdrStrZA(SDNode *N, bool IsLoad) { - // Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA. - // If the vector select parameter is an immediate in the range 0-15 then we - // can emit it directly into the instruction as it's a legal operand. - // Otherwise we must emit 0 as the vector select operand and modify the base - // register instead. 
- SDLoc DL(N); - - SDValue VecNum = N->getOperand(4), Base = N->getOperand(3), - TileSlice = N->getOperand(2); - int Imm = -1; - if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) - Imm = ImmNode->getZExtValue(); - - if (Imm >= 0 && Imm <= 15) { - // 0-15 is a legal immediate so just pass it directly as a TargetConstant - VecNum = CurDAG->getTargetConstant(Imm, DL, MVT::i32); - } else { - // Get the vector length that will be multiplied by vnum - auto SVL = SDValue( - CurDAG->getMachineNode(AArch64::RDSVLI_XI, DL, MVT::i64, - CurDAG->getTargetConstant(1, DL, MVT::i32)), - 0); - - // Multiply SVL and vnum then add it to the base register - if (VecNum.getValueType() == MVT::i32) - VecNum = Widen(CurDAG, VecNum); - SDValue AddOps[] = {SVL, VecNum, Base}; - auto Add = SDValue( - CurDAG->getMachineNode(AArch64::MADDXrrr, DL, MVT::i64, AddOps), 0); - - // The base register has been modified to take vnum into account so just - // pass 0 - VecNum = CurDAG->getTargetConstant(0, DL, MVT::i32); - Base = Add; - } - - SmallVector<SDValue, 6> Ops = {TileSlice, VecNum, Base}; - if (!IsLoad) { - Ops.insert(Ops.begin(), CurDAG->getRegister(AArch64::ZA, MVT::Other)); - Ops.push_back(VecNum); - } - auto LdrStr = - CurDAG->getMachineNode(IsLoad ? 
AArch64::LDR_ZA_PSEUDO : AArch64::STR_ZA, - DL, N->getValueType(0), Ops); - ReplaceNode(N, LdrStr); -} - void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs, bool IsZmMulti, @@ -5712,11 +5663,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { switch (IntNo) { default: break; - case Intrinsic::aarch64_sme_str: - case Intrinsic::aarch64_sme_ldr: { - SelectSMELdrStrZA(Node, IntNo == Intrinsic::aarch64_sme_ldr); - return; - } case Intrinsic::aarch64_neon_st1x2: { if (VT == MVT::v8i8) { SelectStore(Node, 2, AArch64::ST1Twov8b); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7211607fee528a6..ad76289d09a3a32 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2402,6 +2402,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMP) MAKE_CASE(AArch64ISD::STRICT_FCMPE) + MAKE_CASE(AArch64ISD::SME_ZA_LDR) + MAKE_CASE(AArch64ISD::SME_ZA_STR) MAKE_CASE(AArch64ISD::DUP) MAKE_CASE(AArch64ISD::DUPLANE8) MAKE_CASE(AArch64ISD::DUPLANE16) @@ -4825,6 +4827,72 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain, Mask); } +SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { + // Lower an SME LDR/STR ZA intrinsic to LDR_ZA_PSEUDO or STR_ZA. + // If the vector number is an immediate between 0 and 15 inclusive then we can + // put that directly into the immediate field of the instruction. If it's + // outside of that range then we modify the base and slice by the greatest + // multiple of 15 smaller than that number and put the remainder in the + // instruction field. If it's not an immediate then we modify the base and + // slice registers by that number and put 0 in the instruction. 
+ SDLoc DL(N); + + SDValue TileSlice = N->getOperand(2); + SDValue Base = N->getOperand(3); + SDValue VecNum = N->getOperand(4); + SDValue Remainder = DAG.getTargetConstant(0, DL, MVT::i32); + + // true if the base and slice registers need to be modified + bool NeedsAdd = true; + if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { + int Imm = ImmNode->getSExtValue(); + if (Imm >= 0 && Imm <= 15) { + Remainder = DAG.getTargetConstant(Imm, DL, MVT::i32); + NeedsAdd = false; + } else { + Remainder = DAG.getTargetConstant(Imm % 15, DL, MVT::i32); + NeedsAdd = true; + VecNum = DAG.getConstant(Imm - (Imm % 15), DL, MVT::i32); + } + } else if (VecNum.getOpcode() == ISD::ADD) { + // If the vnum is an add, we can fold that add into the instruction if the + // operand is an immediate in range + if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum.getOperand(1))) { + int Imm = ImmNode->getSExtValue(); + if (Imm >= 0 && Imm <= 15) { + VecNum = VecNum.getOperand(0); + Remainder = DAG.getTargetConstant(Imm, DL, MVT::i32); + NeedsAdd = true; + } + } + } + if (NeedsAdd) { + // Get the vector length that will be multiplied by vnum + auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + + // Multiply SVL and vnum then add it to the base + // Just add vnum to the tileslice + SDValue BaseMulOps[] = { + SVL, VecNum.getValueType() == MVT::i32 + ? DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VecNum) + : VecNum}; + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, BaseMulOps); + + SDValue BaseAddOps[] = {Base, Mul}; + Base = DAG.getNode(ISD::ADD, DL, MVT::i64, BaseAddOps); + + SDValue SliceAddOps[] = {TileSlice, VecNum}; + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, SliceAddOps); + } + + SmallVector<SDValue, 4> Ops = {N.getOperand(0), TileSlice, Base, Remainder}; + auto LdrStr = + DAG.getNode(IsLoad ? 
AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, DL, + MVT::Other, Ops); + return LdrStr; +} + SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = Op.getConstantOperandVal(1); @@ -4848,6 +4916,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain, DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr); } + case Intrinsic::aarch64_sme_str: + case Intrinsic::aarch64_sme_ldr: { + return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr); + } case Intrinsic::aarch64_sme_za_enable: return DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 52e519cd8a0c93c..506efe21e126735 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -440,6 +440,10 @@ enum NodeType : unsigned { STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, + // SME ZA loads and stores + SME_ZA_LDR, + SME_ZA_STR, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 7f96929e4d47376..1483c4b3c26d1ae 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -33,6 +33,12 @@ def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0, 4>", []>; def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0,15>", [], [SDNPWantRoot]>; +def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; +def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64SMEStr : SDNode<"AArch64ISD::SME_ZA_STR", SDTZALoadStore, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; + 
//===----------------------------------------------------------------------===// // SME Pseudo Classes //===----------------------------------------------------------------------===// @@ -779,23 +785,23 @@ class sme_spill_inst<string opcodestr> : sme_spill_fill_base<0b1, (outs), (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; let mayLoad = 1 in class sme_fill_inst<string opcodestr> : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt), (ins MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, - imm0_15:$offset), + imm32_0_15:$offset), opcodestr>; multiclass sme_spill<string opcodestr> { def NAME : sme_spill_inst<opcodestr>; def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]", (!cast<Instruction>(NAME) MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>; - // base - def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base, sme_elm_idx0_15:$imm), - (!cast<Instruction>(NAME) ZA, $idx, $imm, $base, 0)>; + + def : Pat<(AArch64SMEStr (i32 MatrixIndexGPR32Op12_15:$slice), (i64 GPR64sp:$base), (i32 sme_elm_idx0_15:$imm)), + (!cast<Instruction>(NAME) ZA, MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base, imm32_0_15:$imm)>; } multiclass sme_fill<string opcodestr> { @@ -812,9 +818,8 @@ multiclass sme_fill<string opcodestr> { let usesCustomInserter = 1; let mayLoad = 1; } - // base - def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base, sme_elm_idx0_15:$imm), - (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm, $base)>; + def : Pat<(AArch64SMELdr MatrixIndexGPR32Op12_15:$slice, GPR64sp:$base, sme_elm_idx0_15:$imm), + (!cast<Instruction>(NAME # _PSEUDO) MatrixIndexGPR32Op12_15:$slice, sme_elm_idx0_15:$imm, GPR64sp:$base)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll 
b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index 340b54cc0d2731f..bcca2133984a6c8 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -299,10 +299,11 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { define void @ldr_with_off_var(ptr %base, i32 %off) { ; CHECK-LABEL: ldr_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: mov w12, #16 // =0x10 -; CHECK-NEXT: madd x8, x8, x1, x0 +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) @@ -323,27 +324,69 @@ define void @ldr_with_off_16imm(ptr %base) { ; CHECK-LABEL: ldr_with_off_16imm: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov w12, #16 // =0x10 -; CHECK-NEXT: madd x8, x8, x12, x0 -; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: add w12, w0, #15 +; CHECK-NEXT: sub x9, x1, x8 +; CHECK-NEXT: add x8, x9, x8, lsl #4 +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: ret call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) ret void; } +define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) { +; CHECK-LABEL: ldr_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: ldr za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} 
+ +define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: ldr_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: ldr za[w12, 0], [x8] +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. 
define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_ld1_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB17_1: // %for.body +; CHECK-NEXT: .LBB19_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w {za0h.s[w12, 0]}, p0/z, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: ld1w {za0h.s[w12, 1]}, p0/z, [x0] ; CHECK-NEXT: ld1w {za0h.s[w12, 2]}, p0/z, [x0] -; CHECK-NEXT: b.ne .LBB17_1 +; CHECK-NEXT: b.ne .LBB19_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index b55c2bc78b0fcf0..f0239aacccada21 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -299,10 +299,11 @@ define void @str_with_off_16mulvl(ptr %ptr) { define void @str_with_off_var(ptr %base, i32 %off) { ; CHECK-LABEL: str_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: mov w12, #16 // =0x10 -; CHECK-NEXT: madd x8, x8, x1, x0 +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) @@ -325,30 +326,70 @@ define void @str_with_off_16imm(ptr %ptr) { ; CHECK-LABEL: str_with_off_16imm: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov w9, #16 // =0x10 -; CHECK-NEXT: add x10, x0, #15 -; CHECK-NEXT: madd x8, x8, x9, x10 -; CHECK-NEXT: mov w12, #15 // =0xf -; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: add w12, w0, #15 +; CHECK-NEXT: sub x9, x1, x8 +; CHECK-NEXT: add x8, x9, x8, lsl #4 +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: 
ret %base = getelementptr i8, ptr %ptr, i64 15 call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) ret void; } +define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_imm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: str za[w12, 1], [x1, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x1, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x1, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] +; CHECK-NEXT: ret +entry: + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) + ret void +} + +define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { +; CHECK-LABEL: str_with_off_many_var: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: add w12, w0, w2 +; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: str za[w12, 0], [x8] +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ret +entry: + %0 = trunc i64 %vnum to i32 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) + %1 = add i32 %0, 1 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) + %2 = add i32 %0, 2 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) + %3 = add i32 %0, 3 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + ret void +} + ; Ensure that the tile offset is sunk, given that this is likely to be an 'add' ; that's decomposed into a base + offset in ISel. 
define void @test_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) { ; CHECK-LABEL: test_sink_tile0_offset_operand: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w12, w1 -; CHECK-NEXT: .LBB17_1: // %for.body +; CHECK-NEXT: .LBB19_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: st1w {za0h.s[w12, 0]}, p0, [x0] ; CHECK-NEXT: subs w2, w2, #1 ; CHECK-NEXT: st1w {za0h.s[w12, 1]}, p0, [x0] ; CHECK-NEXT: st1w {za0h.s[w12, 2]}, p0, [x0] -; CHECK-NEXT: b.ne .LBB17_1 +; CHECK-NEXT: b.ne .LBB19_1 ; CHECK-NEXT: // %bb.2: // %exit ; CHECK-NEXT: ret entry: >From 44beb4d837e42ff49f84d1152afdae05d703160e Mon Sep 17 00:00:00 2001 From: Sam Tebbs <samuel.te...@arm.com> Date: Mon, 6 Nov 2023 10:34:27 +0000 Subject: [PATCH 5/6] fixup! Update check lines --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll | 12 ++++++------ llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll | 13 +++++++------ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ad76289d09a3a32..25a1e8f293556f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4842,7 +4842,7 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { SDValue VecNum = N->getOperand(4); SDValue Remainder = DAG.getTargetConstant(0, DL, MVT::i32); - // true if the base and slice registers need to me modified + // true if the base and slice registers need to be modified bool NeedsAdd = true; if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) { int Imm = ImmNode->getSExtValue(); diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index bcca2133984a6c8..09e7d7b4068ce17 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -299,11 +299,11 @@ define void @ldr_with_off_16mulvl(ptr %ptr) { define void @ldr_with_off_var(ptr %base, i32 %off) { ; CHECK-LABEL: ldr_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 ; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: add w12, w0, w2 -; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) @@ -324,8 +324,8 @@ define void @ldr_with_off_16imm(ptr %base) { ; CHECK-LABEL: ldr_with_off_16imm: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: add w12, w0, #15 -; CHECK-NEXT: sub x9, x1, x8 +; CHECK-NEXT: mov w12, #31 // =0x1f +; CHECK-NEXT: sub x9, x0, x8 ; CHECK-NEXT: add x8, x9, x8, lsl #4 ; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index f0239aacccada21..40327b80a1b96d7 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -299,11 +299,11 @@ define void @str_with_off_16mulvl(ptr %ptr) { define void @str_with_off_var(ptr %base, i32 %off) { ; CHECK-LABEL: str_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 -; CHECK-NEXT: sxtw x8, w2 +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: sxtw x8, w1 ; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: add w12, w0, w2 -; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: add w12, w1, #16 +; CHECK-NEXT: madd x8, x9, x8, x0 ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) @@ -326,9 +326,10 @@ define void @str_with_off_16imm(ptr %ptr) { ; CHECK-LABEL: 
str_with_off_16imm: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: add w12, w0, #15 -; CHECK-NEXT: sub x9, x1, x8 +; CHECK-NEXT: mov w12, #30 // =0x1e +; CHECK-NEXT: sub x9, x0, x8 ; CHECK-NEXT: add x8, x9, x8, lsl #4 +; CHECK-NEXT: add x8, x8, #15 ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 >From 0dc4861d0d2ce730703c669e40f3352ec85b0744 Mon Sep 17 00:00:00 2001 From: Sam Tebbs <samuel.te...@arm.com> Date: Mon, 6 Nov 2023 11:46:34 +0000 Subject: [PATCH 6/6] fixup! Clean up node creation --- .../Target/AArch64/AArch64ISelLowering.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 25a1e8f293556f2..dc9e3432b1c93a3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4872,21 +4872,16 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { DAG.getConstant(1, DL, MVT::i32)); // Multiply SVL and vnum then add it to the base + SDValue Mul = + DAG.getNode(ISD::MUL, DL, MVT::i64, + {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VecNum)}); + Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul}); // Just add vnum to the tileslice - SDValue BaseMulOps[] = { - SVL, VecNum.getValueType() == MVT::i32 - ? 
DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VecNum) - : VecNum}; - SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, BaseMulOps); - - SDValue BaseAddOps[] = {Base, Mul}; - Base = DAG.getNode(ISD::ADD, DL, MVT::i64, BaseAddOps); - - SDValue SliceAddOps[] = {TileSlice, VecNum}; - TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, SliceAddOps); + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VecNum}); } - SmallVector<SDValue, 4> Ops = {N.getOperand(0), TileSlice, Base, Remainder}; + SmallVector<SDValue, 4> Ops = {/*Chain=*/N.getOperand(0), TileSlice, Base, + Remainder}; auto LdrStr = DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, DL, MVT::Other, Ops); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits