https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/78632
>From a786cdedc2c9a9898cd0b80d84f5b11aace5da1c Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 28 Nov 2023 15:44:02 +0000 Subject: [PATCH 1/5] [AArch64] Add custom lowering for load <3 x i8>. Add custom combine to lower load <3 x i8> as the more efficient sequence below: ldrb wX, [x0, #2] ldrh wY, [x0] orr wX, wY, wX, lsl #16 fmov s0, wX At the moment, there are almost no cases in which such vector operations will be generated automatically. The motivating case is non-power-of-2 SLP vectorization: https://github.com/llvm/llvm-project/pull/77790 --- .../Target/AArch64/AArch64ISelLowering.cpp | 54 ++++++++++++++++++- .../AArch64/vec3-loads-ext-trunc-stores.ll | 44 +++++---------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a6f1dc7487bae8..e1139c2fede8e41 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21095,6 +21095,50 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) { return SDValue(); } +// A custom combine to lower load <3 x i8> as the more efficient sequence +// below: +// ldrb wX, [x0, #2] +// ldrh wY, [x0] +// orr wX, wY, wX, lsl #16 +// fmov s0, wX +// +static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { + EVT MemVT = LD->getMemoryVT(); + if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) || + LD->getOriginalAlign() >= 4) + return SDValue(); + + SDLoc DL(LD); + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + + // Load 2 x i8, then 1 x i8. + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), + LD->getOriginalAlign()); + SDValue L8 = + DAG.getLoad(MVT::i8, DL, Chain, + DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL), + LD->getPointerInfo(), LD->getOriginalAlign()); + + // Extend to i32. + SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); + SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8); + + // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8. + SDValue Shr = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8, + DAG.getConstant(16, DL, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shr); + SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or); + + // Extract v3i8 again. + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast, + DAG.getConstant(0, DL, MVT::i64)); + SDValue TokenFactor = DAG.getNode( + ISD::TokenFactor, DL, MVT::Other, + {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)}); + return DAG.getMergeValues({Extract, TokenFactor}, DL); +} + // Perform TBI simplification if supported by the target and try to break up // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit // load instructions can be selected. @@ -21106,10 +21150,16 @@ static SDValue performLOADCombine(SDNode *N, performTBISimplification(N->getOperand(1), DCI, DAG); LoadSDNode *LD = cast<LoadSDNode>(N); - EVT MemVT = LD->getMemoryVT(); - if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian()) + if (LD->isVolatile() || !Subtarget->isLittleEndian()) + return SDValue(N, 0); + + if (SDValue Res = combineV3I8LoadExt(LD, DAG)) + return Res; + + if (!LD->isNonTemporal()) return SDValue(N, 0); + EVT MemVT = LD->getMemoryVT(); if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 || MemVT.getSizeInBits() % 256 == 0 || 256 % MemVT.getScalarSizeInBits() != 0) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 9eeb194409df6fa..7cac4134f0e1598 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -5,19 +5,10 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: umov.h w8, v0[0] -; CHECK-NEXT: umov.h w9, v0[1] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: mov.b v0[1], w9 -; CHECK-NEXT: ld1.b { v0 }[2], [x8] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8: @@ -47,19 +38,14 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #2] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32: @@ -193,19 +179,15 @@ entry: define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: add x8, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_ext_to_64bits: >From 192233f0fda044c759054ae9d79c5b33d66fb1af Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Fri, 19 Jan 2024 16:49:34 +0000 Subject: [PATCH 2/5] !fixup adjust alignment and pointer info --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e1139c2fede8e41..95bc6b5cdff57d3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21115,10 +21115,10 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { // Load 2 x i8, then 1 x i8. SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), LD->getOriginalAlign()); - SDValue L8 = - DAG.getLoad(MVT::i8, DL, Chain, - DAG.getMemBasePlusOffset(BasePtr, TypeSize::getFixed(2), DL), - LD->getPointerInfo(), LD->getOriginalAlign()); + TypeSize Offset2 = TypeSize::getFixed(2); + SDValue L8 = DAG.getLoad( + MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), + LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2)); // Extend to i32. SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); >From 39d6794cceb832afbf3e3bafe2c00413ef405eb7 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Mon, 22 Jan 2024 16:11:35 +0000 Subject: [PATCH 3/5] !fixup add offset assert and update new tests. --- .../Target/AArch64/AArch64ISelLowering.cpp | 1 + .../AArch64/vec3-loads-ext-trunc-stores.ll | 30 +++++++------------ 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index c4e2a6f90827026..e26bb093ee5cbe0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21173,6 +21173,7 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { SDLoc DL(LD); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); + assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 31a3874126d4baf..7435dde4f551bf3 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: ldurh w9, [x0, #1] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #3] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_const_offset_1: @@ -120,19 +115,14 @@ define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldurh w8, [x0, #3] +; CHECK-NEXT: ldrb w8, [x0, #5] +; CHECK-NEXT: ldurh w9, [x0, #3] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #5] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_const_offset_3: >From e96af2fa4ca83bade36e1f0aa1ab2e2b1d6dc49e Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Tue, 23 Jan 2024 14:19:32 +0000 Subject: [PATCH 4/5] !fixup update on top of new test coverage. Update checks after adding more tests in e7b4ff8 --- .../CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index a4698c27cfd2cd3..5a253bea6f1e9fe 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -348,24 +348,20 @@ entry: define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_add_to_64bits: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrh w10, [x0] ; CHECK-NEXT: Lloh2: ; CHECK-NEXT: adrp x8, lCPI9_0@PAGE ; CHECK-NEXT: Lloh3: ; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] ; CHECK-NEXT: add x8, x1, #4 -; CHECK-NEXT: strh w9, [sp, #12] -; CHECK-NEXT: add x9, x0, #2 -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: ld1.b { v0 }[4], [x9] +; CHECK-NEXT: orr w9, w10, w9, lsl #16 +; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: bic.4h v0, #255, lsl #8 ; CHECK-NEXT: add.4h v0, v0, v1 ; CHECK-NEXT: st1.h { v0 }[2], [x8] ; CHECK-NEXT: str s0, [x1] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 ; >From 7e2bf68358fc55e6e770601872b4a6ffd9349ec3 Mon Sep 17 00:00:00 2001 From: Florian Hahn <f...@fhahn.com> Date: Wed, 24 Jan 2024 21:17:34 +0000 Subject: [PATCH 5/5] !fixup update tests and use MMO. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 ++++++----- .../AArch64/vec3-loads-ext-trunc-stores.ll | 15 +++++---------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a53c4740bd3d49f..00d62b7450f3cb8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21262,17 +21262,18 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) { return SDValue(); SDLoc DL(LD); + MachineFunction &MF = DAG.getMachineFunction(); SDValue Chain = LD->getChain(); SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand *MMO = LD->getMemOperand(); assert(LD->getOffset().isUndef() && "undef offset expected"); // Load 2 x i8, then 1 x i8. - SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, LD->getPointerInfo(), - LD->getOriginalAlign()); + SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO); TypeSize Offset2 = TypeSize::getFixed(2); - SDValue L8 = DAG.getLoad( - MVT::i8, DL, Chain, DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), - LD->getPointerInfo(), commonAlignment(LD->getOriginalAlign(), Offset2)); + SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain, + DAG.getMemBasePlusOffset(BasePtr, Offset2, DL), + MF.getMachineMemOperand(MMO, 2, 1)); // Extend to i32. SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16); diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 796580f083d0dc0..275e5ac8b7062e0 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -76,19 +76,14 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32_align_2: ; CHECK: ; %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrh w9, [x0] ; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff -; CHECK-NEXT: strh w8, [sp, #12] -; CHECK-NEXT: ldr s0, [sp, #12] -; CHECK-NEXT: ldrsb w8, [x0, #2] -; CHECK-NEXT: ushll.8h v0, v0, #0 -; CHECK-NEXT: mov.h v0[1], v0[1] -; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: orr w8, w9, w8, lsl #16 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: zip1.8b v0, v0, v0 ; CHECK-NEXT: ushll.4s v0, v0, #0 ; CHECK-NEXT: and.16b v0, v0, v1 -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; ; BE-LABEL: load_v3i8_to_4xi32_align_2: _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits