https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/120598
>From 768b1a91595c23b3902aee20b8142b2897ac3840 Mon Sep 17 00:00:00 2001 From: jofrn <jofer...@amd.com> Date: Thu, 19 Dec 2024 11:19:39 -0500 Subject: [PATCH] [SelectionDAG] Widen <2 x T> vector types for atomic load Vector types of 2 elements must be widened. This change does this for vector types of atomic load in SelectionDAG so that it can translate aligned vectors of >1 size. commit-id:2894ccd1 --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 97 ++++-- llvm/test/CodeGen/X86/atomic-load-store.ll | 286 ++++++++++++++++++ 3 files changed, 361 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d24b4517a460d..b6e018ba0e454 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -1068,6 +1068,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N); SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N); SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N); + SDValue WidenVecRes_ATOMIC_LOAD(AtomicSDNode *N); SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d6cbf2211f053..42763aab5bb55 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4622,6 +4622,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break; + case ISD::ATOMIC_LOAD: + Res = WidenVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N)); + break; case ISD::LOAD: Res = WidenVecRes_LOAD(N); break; case ISD::STEP_VECTOR: case ISD::SPLAT_VECTOR: @@ -6003,6 +6006,74 @@ SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) { N->getOperand(1), N->getOperand(2)); } +/// Either return the same load or provide appropriate casts +/// from the load and return that. +static SDValue coerceLoadedValue(SDValue LdOp, EVT FirstVT, EVT WidenVT, + TypeSize LdWidth, TypeSize FirstVTWidth, + SDLoc dl, SelectionDAG &DAG) { + assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth)); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + if (!FirstVT.isVector()) { + unsigned NumElts = + WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); + EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), FirstVT, NumElts); + SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); + return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); + } + assert(FirstVT == WidenVT); + return LdOp; +} + +static std::optional<EVT> findMemType(SelectionDAG &DAG, + const TargetLowering &TLI, unsigned Width, + EVT WidenVT, unsigned Align, + unsigned WidenEx); + +SDValue DAGTypeLegalizer::WidenVecRes_ATOMIC_LOAD(AtomicSDNode *LD) { + EVT WidenVT = + TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0)); + EVT LdVT = LD->getMemoryVT(); + SDLoc dl(LD); + assert(LdVT.isVector() && WidenVT.isVector() && "Expected vectors"); + assert(LdVT.isScalableVector() == WidenVT.isScalableVector() && + "Must be scalable"); + assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType() && + "Expected equivalent element types"); + + // Load information + SDValue Chain = LD->getChain(); + SDValue BasePtr = LD->getBasePtr(); + MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags(); + AAMDNodes AAInfo = LD->getAAInfo(); + + TypeSize LdWidth = LdVT.getSizeInBits(); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + TypeSize WidthDiff = WidenWidth - LdWidth; + + // Find the vector type that can load from. + std::optional<EVT> FirstVT = + findMemType(DAG, TLI, LdWidth.getKnownMinValue(), WidenVT, /*LdAlign=*/0, + WidthDiff.getKnownMinValue()); + + if (!FirstVT) + return SDValue(); + + SmallVector<EVT, 8> MemVTs; + TypeSize FirstVTWidth = FirstVT->getSizeInBits(); + + SDValue LdOp = DAG.getAtomicLoad(ISD::NON_EXTLOAD, dl, *FirstVT, *FirstVT, + Chain, BasePtr, LD->getMemOperand()); + + // Load the element with one instruction. + SDValue Result = coerceLoadedValue(LdOp, *FirstVT, WidenVT, LdWidth, + FirstVTWidth, dl, DAG); + + // Modified the chain - switch anything that used the old chain to use + // the new one. + ReplaceValueWith(SDValue(LD, 1), LdOp.getValue(1)); + return Result; +} + SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(N); ISD::LoadExtType ExtType = LD->getExtensionType(); @@ -7894,29 +7965,9 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain, LdChain.push_back(LdOp.getValue(1)); // Check if we can load the element with one instruction. - if (MemVTs.empty()) { - assert(TypeSize::isKnownLE(LdWidth, FirstVTWidth)); - if (!FirstVT->isVector()) { - unsigned NumElts = - WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); - EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), *FirstVT, NumElts); - SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp); - return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp); - } - if (FirstVT == WidenVT) - return LdOp; - - // TODO: We don't currently have any tests that exercise this code path. - assert(WidenWidth.getFixedValue() % FirstVTWidth.getFixedValue() == 0); - unsigned NumConcat = - WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); - SmallVector<SDValue, 16> ConcatOps(NumConcat); - SDValue UndefVal = DAG.getUNDEF(*FirstVT); - ConcatOps[0] = LdOp; - for (unsigned i = 1; i != NumConcat; ++i) - ConcatOps[i] = UndefVal; - return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps); - } + if (MemVTs.empty()) + return coerceLoadedValue(LdOp, *FirstVT, WidenVT, LdWidth, FirstVTWidth, dl, + DAG); // Load vector by using multiple loads from largest vector to scalar. SmallVector<SDValue, 16> LdOps; diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 3e7b73a65fe07..ff5391f44bbe3 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -270,6 +270,212 @@ define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind { ret <1 x i64> %ret } +define <2 x i8> @atomic_vec2_i8(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_i8: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movzwl (%rdi), %eax +; CHECK-O3-NEXT: movd %eax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_i8: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_i8: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec2_i8: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movw (%rdi), %cx +; CHECK-O0-NEXT: # implicit-def: $eax +; CHECK-O0-NEXT: movw %cx, %ax +; CHECK-O0-NEXT: movd %eax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_i8: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movw (%rdi), %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_i8: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movw (%rdi), %cx +; CHECK-AVX-O0-NEXT: # implicit-def: $eax +; CHECK-AVX-O0-NEXT: movw %cx, %ax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x i8>, ptr %x acquire, align 4 + ret <2 x i8> %ret +} + +define <2 x i16> @atomic_vec2_i16(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_i16: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movl (%rdi), %eax +; CHECK-O3-NEXT: movd %eax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_i16: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_i16: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movl (%rdi), %eax +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec2_i16: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movl (%rdi), %eax +; CHECK-O0-NEXT: movd %eax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_i16: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_i16: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movl (%rdi), %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x i16>, ptr %x acquire, align 4 + ret <2 x i16> %ret +} + +define <2 x ptr addrspace(270)> @atomic_vec2_ptr270(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_ptr270: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: movq %rax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_ptr270: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_ptr270: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec2_ptr270: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: movq %rax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_ptr270: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_ptr270: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x ptr addrspace(270)>, ptr %x acquire, align 8 + ret <2 x ptr addrspace(270)> %ret +} + +define <2 x i32> @atomic_vec2_i32_align(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_i32_align: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: movq %rax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_i32_align: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_i32_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec2_i32_align: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: movq %rax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_i32_align: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_i32_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x i32>, ptr %x acquire, align 8 + ret <2 x i32> %ret +} + +define <2 x float> @atomic_vec2_float_align(ptr %x) { +; CHECK-O3-LABEL: atomic_vec2_float_align: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: movq %rax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec2_float_align: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_float_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec2_float_align: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: movq %rax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_float_align: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_float_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x float>, ptr %x acquire, align 8 + ret <2 x float> %ret +} + define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { ; CHECK-O3-LABEL: atomic_vec1_ptr: ; CHECK-O3: # %bb.0: @@ -691,6 +897,86 @@ define <2 x i32> @atomic_vec2_i32(ptr %x) nounwind { ret <2 x i32> %ret } +define <4 x i8> @atomic_vec4_i8(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec4_i8: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movl (%rdi), %eax +; CHECK-O3-NEXT: movd %eax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec4_i8: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_i8: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movl (%rdi), %eax +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec4_i8: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movl (%rdi), %eax +; CHECK-O0-NEXT: movd %eax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec4_i8: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_i8: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movl (%rdi), %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x i8>, ptr %x acquire, align 4 + ret <4 x i8> %ret +} + +define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind { +; CHECK-O3-LABEL: atomic_vec4_i16: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: movq (%rdi), %rax +; CHECK-O3-NEXT: movq %rax, %xmm0 +; CHECK-O3-NEXT: retq +; +; CHECK-SSE-O3-LABEL: atomic_vec4_i16: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_i16: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-O0-LABEL: atomic_vec4_i16: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: movq (%rdi), %rax +; CHECK-O0-NEXT: movq %rax, %xmm0 +; CHECK-O0-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec4_i16: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_i16: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: vmovq %rax, %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x i16>, ptr %x acquire, align 8 + ret <4 x i16> %ret +} + define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ; CHECK-O3-LABEL: atomic_vec4_float: ; CHECK-O3: # %bb.0: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits