https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/78632
>From a786cdedc2c9a9898cd0b80d84f5b11aace5da1c Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 28 Nov 2023 15:44:02 +0000 Subject: [PATCH 01/11] [AArch64] Add custom lowering for load <3 x i8>. Add custom combine to lower load <3 x i8> as the more efficient sequence below: ldrb wX, [x0, #2] ldrh wY, [x0] orr wX, wY, wX, lsl #16 fmov s0, wX At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: https://github.com/llvm/llvm-project/pull/77790 --- .../Target/AArch64/AArch64ISelLowering.cpp | 54 ++++++++++++++++++- .../AArch64/vec3-loads-ext-trunc-stores.ll | 44 +++++---------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a6f1dc7487b..e1139c2fede8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { return SDValue(); } +// A custom combine to lower load <3 x i8> as the more efficient sequence +// below: +// ldrb wX, [x0, #2] +// ldrh wY, [x0] +// orr wX, wY, wX, lsl #16 +// fmov s0, wX +// +static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { + EVT MemVT = LD->getMemoryVT(); + if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) || + LD->getOriginalAlign() >= 4) + return SDValue(); + + SDLoc DL(LD); + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + + // Load 2 x i8, then 1 x i8. + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), + LD->getOriginalAlign()); + SDValue L8 = + DAG.getLoad(MVT::i8, DL, Chain, + DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL), + LD->getPointerInfo(), LD->getOriginalAlign()); + + // Extend to i32. + SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); + SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + + // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. + SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, + DAG.getConstant(16, DL, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); + + // Extract v3i8 again. + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast, + DAG.getConstant(0, DL, MVT::i64)); + SDValue TokenFactor = DAG.getNode( + ISD::TokenFactor, DL, MVT::Other, + {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)}); + return DAG.getMergeValues({Extract, TokenFactor}, DL); +} + // Perform TBI simplification if supported by the target and try to break up // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit // load instructions can be selected. @@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N, performTBISimplification(N->getOperand(1), DCI, DAG); LoadSDNode *LD = cast<LoadSDNode>(N); - EVT MemVT = LD->getMemoryVT(); - if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian()) + if (LD->isVolatile() || !Subtarget->isLittleEndian()) + return SDValue(N, 0); + + if (SDValue Res = combineV3I8LoadExt(LD, DAG)) + return Res; + + if (!LD->isNonTemporal()) return SDValue(N, 0); + EVT MemVT = LD->getMemoryVT(); if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || MemVT.getSizeInBits() % 256 == 0 || 256 % MemVT.getScalarSizeInBits() != 0) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 9eeb194409df..7cac4134f0e1 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -5,19 +5,10 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: umov.h w8, v0[0] -; CHECK-NEXT: umov.h w9, v0[1] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: mov.b v0[1], w9 -; CHECK-NEXT: ld1.b { v0 }[2], [x8] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8: @@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #2] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32: @@ -193,19 +179,15 @@ entry: define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_ext_to_64bits: >From 192233f0fda044c759054ae9d79c5b33d66fb1af Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Fri, 19 Jan 2024 16:49:34 +0000 Subject: [PATCH 02/11] !fixup adjust alignment and pointer info --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e1139c2fede8..95bc6b5cdff5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21115,10 +21115,10 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { // Load 2 x i8, then 1 x i8. SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), LD->getOriginalAlign()); - SDValue L8 = - DAG.getLoad(MVT::i8, DL, Chain, - DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL), - LD->getPointerInfo(), LD->getOriginalAlign()); + TypeSize Offset2 = TypeSize::getFixed(2); + SDValue L8 = DAG.getLoad( + MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), + LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2)); // Extend to i32. SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); >From 39d6794cceb832afbf3e3bafe2c00413ef405eb7 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Mon, 22 Jan 2024 16:11:35 +0000 Subject: [PATCH 03/11] !fixup add offset assert and update new tests. --- .../Target/AArch64/AArch64ISelLowering.cpp | 1 + .../AArch64/vec3-loads-ext-trunc-stores.ll | 30 +++++++------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c4e2a6f90827..e26bb093ee5c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21173,6 +21173,7 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { SDLoc DL(LD); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); + assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 31a3874126d4..7435dde4f551 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: ldurh w9, [x0, #1] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #3] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_const_offset_1: @@ -120,19 +115,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldurh w8, [x0, #3] +; CHECK-NEXT: ldrb w8, [x0, #5] +; CHECK-NEXT: ldurh w9, [x0, #3] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #5] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_const_offset_3: >From e96af2fa4ca83bade36e1f0aa1ab2e2b1d6dc49e Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 23 Jan 2024 14:19:32 +0000 Subject: [PATCH 04/11] !fixup update on top of new test coverage. Update checks after adding more tests in e7b4ff8 --- .../CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index a4698c27cfd2..5a253bea6f1e 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -348,24 +348,20 @@ entry: define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_add_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrh w10, [x0] ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI9_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x9] +; CHECK-NEXT: orr w9, w10, w9, lsl #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 ; >From 7e2bf68358fc55e6e770601872b4a6ffd9349ec3 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Wed, 24 Jan 2024 21:17:34 +0000 Subject: [PATCH 05/11] !fixup update tests and use MMO. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 ++++++----- .../AArch64/vec3-loads-ext-trunc-stores.ll | 15 +++++---------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a53c4740bd3d..00d62b7450f3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21262,17 +21262,18 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { return SDValue(); SDLoc DL(LD); + MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand *MMO = LD->getMemOperand(); assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. - SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), - LD->getOriginalAlign()); + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO); TypeSize Offset2 = TypeSize::getFixed(2); - SDValue L8 = DAG.getLoad( - MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), - LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2)); + SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain, + DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), + MF.getMachineMemOperand(MMO, 2, 1)); // Extend to i32. SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 796580f083d0..275e5ac8b706 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_align_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #2] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_align_2: >From 109038bab1328d667a6e2eaf01acc82c33c95431 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Fri, 19 Jan 2024 16:40:46 +0000 Subject: [PATCH 06/11] Try using LD1r. --- .../Target/AArch64/AArch64ISelLowering.cpp | 68 ++++++++++++++++--- .../AArch64/vec3-loads-ext-trunc-stores.ll | 58 +++++----------- 2 files changed, 75 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 00d62b7450f3..6dc56ab3347a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11012,6 +11012,48 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { MaskSourceVec); } +// Check if Op is a BUILD_VECTOR with 2 extracts and a load that is cheaper to +// insert into a vector and use a shuffle. This improves lowering for loads of +// <3 x i8>. +static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) { + if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16) + return SDValue(); + + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + SDValue V2 = Op.getOperand(2); + SDValue V3 = Op.getOperand(3); + if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) + return SDValue(); + + if (V0.getOperand(0) != V1.getOperand(0) || + V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) + return SDValue(); + + SDLoc dl(Op); + auto *L = cast<LoadSDNode>(Op.getOperand(2)); + auto Vec = V0.getOperand(0); + + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec, + SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64)); + Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec); + + SDValue ShuffleOps[] = {DAG.getUNDEF(MVT::v4i16), DAG.getUNDEF(MVT::v4i16)}; + ShuffleOps[0] = Vec; + + SmallVector<int, 8> Mask(4, -1); + Mask[0] = 0; + Mask[1] = 1; + Mask[2] = 2; + if (!V3.isUndef()) + Mask[3] = 3; + SDValue Shuffle = + DAG.getVectorShuffle(MVT::v4i16, dl, ShuffleOps[0], ShuffleOps[1], Mask); + return Shuffle; +} + // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, @@ -11022,6 +11064,10 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, EVT VT = Op.getValueType(); assert(!VT.isScalableVector() && "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); + + if (SDValue S = shuffleWithSingleLoad(Op, DAG)) + return S; + unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { @@ -11048,6 +11094,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. + // SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -21269,24 +21316,23 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. - SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO); + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, + MF.getMachineMemOperand(MMO, 0, 2)); TypeSize Offset2 = TypeSize::getFixed(2); SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), MF.getMachineMemOperand(MMO, 2, 1)); - // Extend to i32. - SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); - SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16); - // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. - SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, - DAG.getConstant(16, DL, MVT::i32)); - SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16); + + SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8); - // Extract v3i8 again. - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast, + SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast, + Trunc8, DAG.getConstant(2, DL, MVT::i64)); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8, DAG.getConstant(0, DL, MVT::i64)); SDValue TokenFactor = DAG.getNode( ISD::TokenFactor, DL, MVT::Other, diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 275e5ac8b706..248aa20bab63 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -5,10 +5,8 @@ define <16 x i8> @load_v3i8(ptr %src) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8: @@ -38,12 +36,9 @@ define <16 x i8> @load_v3i8(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -59,7 +54,6 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -76,12 +70,9 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_align_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -97,7 +88,6 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -141,12 +131,11 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #3] -; CHECK-NEXT: ldurh w9, [x0, #1] +; CHECK-NEXT: add x8, x0, #1 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: add x8, x0, #3 +; CHECK-NEXT: ld1.b { v0 }[2], [x8] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -162,7 +151,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #3] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -180,12 +168,11 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #5] -; CHECK-NEXT: ldurh w9, [x0, #3] +; CHECK-NEXT: add x8, x0, #3 ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1r.4h { v0 }, [x8] +; CHECK-NEXT: add x8, x0, #5 +; CHECK-NEXT: ld1.b { v0 }[2], [x8] ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -201,7 +188,6 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #5] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -263,7 +249,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; CHECK-NEXT: ldr s0, [sp, #12] ; CHECK-NEXT: ldrsb w8, [x0, #2] ; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] ; CHECK-NEXT: mov.h v0[2], w8 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 @@ -281,7 +266,6 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 -; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -410,12 +394,9 @@ entry: define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrh w9, [x0] -; CHECK-NEXT: orr w8, w9, w8, lsl #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] @@ -507,16 +488,13 @@ entry: define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_add_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ldrb w9, [x0, #2] -; CHECK-NEXT: ldrh w10, [x0] +; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI13_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF] ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: orr w9, w10, w9, lsl #16 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: zip1.8b v0, v0, v0 +; CHECK-NEXT: ld1.b { v0 }[2], [x0] ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.h { v0 }[2], [x8] >From e6d5725008779d0232465f81f84b7bfd4a1fcb34 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Thu, 25 Jan 2024 18:06:13 +0000 Subject: [PATCH 07/11] !fixup fix formatting --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6dc56ab3347a..a201fdadc367 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11025,11 +11025,13 @@ static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) { SDValue V3 = Op.getOperand(3); if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) + V2.getOpcode() != ISD::LOAD || + !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) return SDValue(); if (V0.getOperand(0) != V1.getOperand(0) || - V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) + V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || + !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) return SDValue(); SDLoc dl(Op); >From fac632405afbe34ae5c66ae118928aab933faccb Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 30 Jan 2024 09:22:46 +0000 Subject: [PATCH 08/11] Revert "!fixup fix formatting" This reverts commit 667b5c1fa2527c2fe756673ea2dad54eeecc3e82. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9b762e3a6c2b..b01bddc31474 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11025,13 +11025,11 @@ static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) { SDValue V3 = Op.getOperand(3); if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - V2.getOpcode() != ISD::LOAD || - !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) + V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) return SDValue(); if (V0.getOperand(0) != V1.getOperand(0) || - V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || - !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) + V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) return SDValue(); SDLoc dl(Op); >From ebb84fc5389f914e014c92fb60338ed00a41a4ab Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 30 Jan 2024 09:22:53 +0000 Subject: [PATCH 09/11] Revert "Try using LD1r." This reverts commit 109038bab1328d667a6e2eaf01acc82c33c95431. --- .../Target/AArch64/AArch64ISelLowering.cpp | 68 +++---------------- .../AArch64/vec3-loads-ext-trunc-stores.ll | 58 +++++++++++----- 2 files changed, 51 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b01bddc31474..f130beca5e15 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11012,48 +11012,6 @@ SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG) { MaskSourceVec); } -// Check if Op is a BUILD_VECTOR with 2 extracts and a load that is cheaper to -// insert into a vector and use a shuffle. This improves lowering for loads of -// <3 x i8>. -static SDValue shuffleWithSingleLoad(SDValue Op, SelectionDAG &DAG) { - if (Op.getNumOperands() != 4 || Op.getValueType() != MVT::v4i16) - return SDValue(); - - SDValue V0 = Op.getOperand(0); - SDValue V1 = Op.getOperand(1); - SDValue V2 = Op.getOperand(2); - SDValue V3 = Op.getOperand(3); - if (V0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - V1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - V2.getOpcode() != ISD::LOAD || !(V3.isUndef() || V3.getOpcode() == ISD::EXTRACT_VECTOR_ELT)) - return SDValue(); - - if (V0.getOperand(0) != V1.getOperand(0) || - V0.getConstantOperandVal(1) != 0 || V1.getConstantOperandVal(1) != 1 || !(V3.isUndef() || V3.getConstantOperandVal(1) == 3)) - return SDValue(); - - SDLoc dl(Op); - auto *L = cast<LoadSDNode>(Op.getOperand(2)); - auto Vec = V0.getOperand(0); - - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Vec.getValueType(), Vec, - SDValue(L, 0), DAG.getConstant(2, dl, MVT::i64)); - Vec = DAG.getNode(ISD::BITCAST, dl, MVT::v4i16, Vec); - - SDValue ShuffleOps[] = {DAG.getUNDEF(MVT::v4i16), DAG.getUNDEF(MVT::v4i16)}; - ShuffleOps[0] = Vec; - - SmallVector<int, 8> Mask(4, -1); - Mask[0] = 0; - Mask[1] = 1; - Mask[2] = 2; - if (!V3.isUndef()) - Mask[3] = 3; - SDValue Shuffle = - DAG.getVectorShuffle(MVT::v4i16, dl, ShuffleOps[0], ShuffleOps[1], Mask); - return Shuffle; -} - // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, @@ -11064,10 +11022,6 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, EVT VT = Op.getValueType(); assert(!VT.isScalableVector() && "Scalable vectors cannot be used with ISD::BUILD_VECTOR"); - - if (SDValue S = shuffleWithSingleLoad(Op, DAG)) - return S; - unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { @@ -11094,7 +11048,6 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. - // SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); @@ -21316,23 +21269,24 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. - SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, - MF.getMachineMemOperand(MMO, 0, 2)); + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO); TypeSize Offset2 = TypeSize::getFixed(2); SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), MF.getMachineMemOperand(MMO, 2, 1)); - SDValue Ins16 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::v4i16, L16); - - SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Ins16); - + // Extend to i32. + SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); - SDValue Trunc8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Ext8); - SDValue Ins8 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i8, Cast, - Trunc8, DAG.getConstant(2, DL, MVT::i64)); - SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Ins8, + // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. + SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, + DAG.getConstant(16, DL, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); + + // Extract v3i8 again. + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast, DAG.getConstant(0, DL, MVT::i64)); SDValue TokenFactor = DAG.getNode( ISD::TokenFactor, DL, MVT::Other, diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index a4e490cb66e0..0494851bd074 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -5,8 +5,10 @@ define <16 x i8> @load_v3i8(ptr %src) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 -; CHECK-NEXT: ld1.b { v0 }[2], [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8: @@ -36,9 +38,12 @@ define <16 x i8> @load_v3i8(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: ld1.b { v0 }[2], [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -54,6 +59,7 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -70,9 +76,12 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_align_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: ld1.b { v0 }[2], [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -88,6 +97,7 @@ define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -131,11 +141,12 @@ define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: add x8, x0, #1 +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: ldurh w9, [x0, #1] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: ld1r.4h { v0 }, [x8] -; CHECK-NEXT: add x8, x0, #3 -; CHECK-NEXT: ld1.b { v0 }[2], [x8] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -151,6 +162,7 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #3] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -168,11 +180,12 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: add x8, x0, #3 +; CHECK-NEXT: ldrb w8, [x0, #5] +; CHECK-NEXT: ldurh w9, [x0, #3] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: ld1r.4h { v0 }, [x8] -; CHECK-NEXT: add x8, x0, #5 -; CHECK-NEXT: ld1.b { v0 }[2], [x8] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 ; CHECK-NEXT: ret @@ -188,6 +201,7 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #5] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -249,6 +263,7 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; CHECK-NEXT: ldr s0, [sp, #12] ; CHECK-NEXT: ldrsb w8, [x0, #2] ; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: mov.h v0[1], v0[1] ; CHECK-NEXT: mov.h v0[2], w8 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 @@ -266,6 +281,7 @@ define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; BE-NEXT: ldrsb w8, [x0, #2] ; BE-NEXT: rev32 v0.8b, v0.8b ; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b @@ -388,9 +404,12 @@ entry: define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: ld1.b { v0 }[2], [x0] +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] @@ -482,13 +501,16 @@ entry: define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_add_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: ld1r.4h { v0 }, [x0], #2 +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrh w10, [x0] ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI13_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF] ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: ld1.b { v0 }[2], [x0] +; CHECK-NEXT: orr w9, w10, w9, lsl #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.h { v0 }[2], [x8] >From c1013f847c12c72d24120f2cf48bbfa7782457a8 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 30 Jan 2024 09:52:41 +0000 Subject: [PATCH 10/11] Add note about alternative sequence. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f130beca5e15..62af99d817ee 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21255,6 +21255,15 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { // orr wX, wY, wX, lsl #16 // fmov s0, wX // +// Note that an alternative sequence with even fewer (although usually more +// complex/expensive) instructions would be: +// ld1r.4h { v0 }, [x0], #2 +// ld1.b { v0 }[2], [x0] +// +// Generating this sequence unfortunately results in noticeably worse codegen +// for code that extends the loaded v3i8, due to legalization breaking vector +// shuffle detection in a way that is very difficult to work around. +// TODO: Revisit once v3i8 legalization has been improved in general. static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { EVT MemVT = LD->getMemoryVT(); if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) || >From ca48e7809eda32ad4c3c8ab388149a22ff9fbf1e Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 30 Jan 2024 13:25:11 +0000 Subject: [PATCH 11/11] !fixup fix naming + update new tests. --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 70 ++++++++----------- 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 62af99d817ee..9e982db8b70c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21289,9 +21289,9 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. - SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, + SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, DAG.getConstant(16, DL, MVT::i32)); - SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl); SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); // Extract v3i8 again. diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index d17650fff6df..90328f73f86b 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -319,18 +319,14 @@ define <3 x i32> @load_v3i32(ptr %src) { define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_zext_to_3xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_zext_to_3xi32: @@ -359,18 +355,14 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) { define <3 x i32> @load_v3i8_sext_to_3xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_sext_to_3xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: shl.4s v0, v0, #24 ; CHECK-NEXT: sshr.4s v0, v0, #24 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_sext_to_3xi32: @@ -843,24 +835,21 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-LABEL: load_v3i8_zext_to_3xi32_add_trunc_store: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrh w10, [x0] ; CHECK-NEXT: Lloh4: ; CHECK-NEXT: adrp x8, lCPI22_0@PAGE ; CHECK-NEXT: Lloh5: ; CHECK-NEXT: ldr q1, [x8, lCPI22_0@PAGEOFF] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x9] +; CHECK-NEXT: add x8, x0, #2 +; CHECK-NEXT: orr w9, w10, w9, lsl #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[8], [x8] +; CHECK-NEXT: add x8, x0, #1 ; CHECK-NEXT: st1.b { v0 }[0], [x0] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh4, Lloh5 ; @@ -899,24 +888,21 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; CHECK-LABEL: load_v3i8_sext_to_3xi32_add_trunc_store: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrh w10, [x0] ; CHECK-NEXT: Lloh6: ; CHECK-NEXT: adrp x8, lCPI23_0@PAGE ; CHECK-NEXT: Lloh7: ; CHECK-NEXT: ldr q1, [x8, lCPI23_0@PAGEOFF] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x9] +; CHECK-NEXT: add x8, x0, #2 +; CHECK-NEXT: orr w9, w10, w9, lsl #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: uaddw.4s v0, v1, v0 -; CHECK-NEXT: st1.b { v0 }[4], [x8] -; CHECK-NEXT: st1.b { v0 }[8], [x9] +; CHECK-NEXT: st1.b { v0 }[8], [x8] +; CHECK-NEXT: add x8, x0, #1 ; CHECK-NEXT: st1.b { v0 }[0], [x0] -; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: st1.b { v0 }[4], [x8] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh6, Lloh7 ; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits