llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64

Author: None (llvmbot)

Changes

Backport 58d70dc62b219cd89ba434c96928a0d9c1b23a60

Requested by: @guy-david

---

Full diff: https://github.com/llvm/llvm-project/pull/151317.diff

5 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+64)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+9)
- (modified) llvm/test/CodeGen/AArch64/selectopt-const.ll (+2-2)
- (added) llvm/test/CodeGen/AArch64/store-float-conversion.ll (+131)
- (modified) llvm/test/CodeGen/AArch64/tbl-loops.ll (+21-21)

A short before/after codegen sketch follows the full diff.

``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bde4ba993f69e..f5d14e8972d26 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23908,6 +23908,67 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
                               Store->getMemOperand());
 }
 
+// Combine store (fp_to_int X) to use vector semantics around the conversion
+// when NEON is available. This allows us to store the in-vector result
+// directly, without transferring the result into a GPR first.
+static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG,
+                                        const AArch64Subtarget *Subtarget) {
+  // Limit to post-legalization in order to avoid peeling truncating stores.
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+  if (!Subtarget->isNeonAvailable())
+    return SDValue();
+  // Bail out if the stored value is already a vector.
+  SDValue Value = ST->getValue();
+  if (Value.getValueType().isVector())
+    return SDValue();
+
+  // Look through potential assertions.
+  while (Value->isAssert())
+    Value = Value.getOperand(0);
+
+  if (Value.getOpcode() != ISD::FP_TO_SINT &&
+      Value.getOpcode() != ISD::FP_TO_UINT)
+    return SDValue();
+  if (!Value->hasOneUse())
+    return SDValue();
+
+  SDValue FPSrc = Value.getOperand(0);
+  EVT SrcVT = FPSrc.getValueType();
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return SDValue();
+
+  // No support for mismatched widths, such as i64 = fp_to_sint f32.
+  EVT VT = Value.getSimpleValueType();
+  if (VT != SrcVT.changeTypeToInteger())
+    return SDValue();
+
+  // Create a 128-bit vector to avoid widening. The floating point
+  // conversion is transformed into a single-element conversion via a pattern.
+  unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
+  EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
+  EVT VecDstVT = VecSrcVT.changeTypeToInteger();
+  SDLoc DL(ST);
+  SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
+  SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
+
+  if (ST->isTruncatingStore()) {
+    EVT NewVecDstVT = EVT::getVectorVT(
+        *DAG.getContext(), ST->getMemoryVT(),
+        VecDstVT.getFixedSizeInBits() / ST->getMemoryVT().getFixedSizeInBits());
+    VecConv = DAG.getNode(AArch64ISD::NVCAST, DL, NewVecDstVT, VecConv);
+  }
+
+  SDValue Zero = DAG.getVectorIdxConstant(0, DL);
+  SDValue Extracted =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
+
+  DCI.CombineTo(ST->getValue().getNode(), Extracted);
+  return SDValue(ST, 0);
+}
+
 bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
   return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
          (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -23990,6 +24051,9 @@ static SDValue performSTORECombine(SDNode *N,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(ST);
 
+  if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
+    return Res;
+
   auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
     EVT EltVT = VT.getVectorElementType();
     return EltVT == MVT::f32 || EltVT == MVT::f64;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9a..13194147da8f6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6632,6 +6632,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
           (UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
 }
 
+def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+          (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
+          (v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
+def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+          (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
+def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
+          (v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
+
 // int -> float conversion of value in lane 0 of simd vector should use
 // correct cvtf variant to avoid costly fpr <-> gpr register transfers.
 def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
diff --git a/llvm/test/CodeGen/AArch64/selectopt-const.ll b/llvm/test/CodeGen/AArch64/selectopt-const.ll
index a44c746e0f281..fe48dbaf1ab76 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-const.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-const.ll
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
 ; CHECK-NEXT:    csel x10, x9, xzr, lt
 ; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    ldr s3, [x4, x10]
-; CHECK-NEXT:    fcvtzs w10, s3
-; CHECK-NEXT:    str w10, [x2], #4
+; CHECK-NEXT:    fcvtzs s3, s3
+; CHECK-NEXT:    st1 { v3.s }[0], [x2], #4
 ; CHECK-NEXT:    b.ne .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: // %for.cond.cleanup
 ; CHECK-NEXT:    mov w0, wzr
diff --git a/llvm/test/CodeGen/AArch64/store-float-conversion.ll b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
new file mode 100644
index 0000000000000..c46801fc16714
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store-float-conversion.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
+
+define void @f32_to_u8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str b0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_s8(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str b0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_u16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  %trunc = trunc i32 %conv to i16
+  store i16 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_s16(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str h0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i16
+  store i16 %trunc, ptr %dst
+  ret void
+}
+
+define void @f32_to_u32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui float %f to i32
+  store i32 %conv, ptr %dst
+  ret void
+}
+
+define void @f32_to_s32(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  store i32 %conv, ptr %dst
+  ret void
+}
+
+define void @f32_to_s64(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    sxtw x8, w8
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %ext = sext i32 %conv to i64
+  store i64 %ext, ptr %dst
+  ret void
+}
+
+define void @f64_to_u64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_u64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu d0, d0
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptoui double %d to i64
+  store i64 %conv, ptr %dst
+  ret void
+}
+
+define void @f64_to_s64(double %d, ptr %dst) {
+; CHECK-LABEL: f64_to_s64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs d0, d0
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi double %d to i64
+  store i64 %conv, ptr %dst
+  ret void
+}
+
+define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
+; CHECK-LABEL: f32_to_i32_multiple_uses:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    strb w8, [x9]
+; CHECK-NEXT:    ret
+entry:
+  %conv = fptosi float %f to i32
+  %trunc = trunc i32 %conv to i8
+  store i8 %trunc, ptr %dst
+  ret i32 %conv
+}
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index aa0a163b96ac8..223698ba225a8 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    fcmp s2, #0.0
 ; CHECK-NEXT:    fcsel s2, s0, s3, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    fcvtzs w11, s2
-; CHECK-NEXT:    strb w11, [x9], #1
+; CHECK-NEXT:    fcvtzs s2, s2
+; CHECK-NEXT:    st1 { v2.b }[0], [x9], #1
 ; CHECK-NEXT:    b.ne .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: // %for.cond.cleanup
 ; CHECK-NEXT:    ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    fcmp s3, s1
 ; CHECK-NEXT:    fcsel s4, s1, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
-; CHECK-NEXT:    fcvtzs w11, s2
+; CHECK-NEXT:    fcvtzs s2, s2
 ; CHECK-NEXT:    fcsel s3, s0, s4, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    strb w11, [x9]
-; CHECK-NEXT:    fcvtzs w12, s3
-; CHECK-NEXT:    strb w12, [x9, #1]
+; CHECK-NEXT:    str b2, [x9]
+; CHECK-NEXT:    fcvtzs s3, s3
+; CHECK-NEXT:    stur b3, [x9, #1]
 ; CHECK-NEXT:    add x9, x9, #2
 ; CHECK-NEXT:    b.ne .LBB1_6
 ; CHECK-NEXT:  .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    fcsel s4, s1, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    ldr s3, [x8, #8]
-; CHECK-NEXT:    fcvtzs w11, s2
+; CHECK-NEXT:    fcvtzs s2, s2
 ; CHECK-NEXT:    add x8, x8, #12
 ; CHECK-NEXT:    fcsel s4, s0, s4, mi
 ; CHECK-NEXT:    fcmp s3, s1
-; CHECK-NEXT:    strb w11, [x9]
+; CHECK-NEXT:    str b2, [x9]
 ; CHECK-NEXT:    fcsel s5, s1, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
-; CHECK-NEXT:    fcvtzs w12, s4
+; CHECK-NEXT:    fcvtzs s4, s4
 ; CHECK-NEXT:    fcsel s3, s0, s5, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    strb w12, [x9, #1]
-; CHECK-NEXT:    fcvtzs w13, s3
-; CHECK-NEXT:    strb w13, [x9, #2]
+; CHECK-NEXT:    stur b4, [x9, #1]
+; CHECK-NEXT:    fcvtzs s3, s3
+; CHECK-NEXT:    stur b3, [x9, #2]
 ; CHECK-NEXT:    add x9, x9, #3
 ; CHECK-NEXT:    b.ne .LBB2_8
 ; CHECK-NEXT:  .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    fcmp s3, s1
 ; CHECK-NEXT:    fcsel s4, s1, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
-; CHECK-NEXT:    fcvtzs w11, s2
+; CHECK-NEXT:    fcvtzs s2, s2
 ; CHECK-NEXT:    ldp s3, s5, [x8, #8]
 ; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    fcsel s4, s0, s4, mi
 ; CHECK-NEXT:    fcmp s3, s1
-; CHECK-NEXT:    strb w11, [x9]
-; CHECK-NEXT:    fcvtzs w12, s4
+; CHECK-NEXT:    str b2, [x9]
+; CHECK-NEXT:    fcvtzs s4, s4
 ; CHECK-NEXT:    fcsel s6, s1, s3, gt
 ; CHECK-NEXT:    fcmp s3, #0.0
 ; CHECK-NEXT:    fcsel s3, s0, s6, mi
 ; CHECK-NEXT:    fcmp s5, s1
-; CHECK-NEXT:    strb w12, [x9, #1]
+; CHECK-NEXT:    stur b4, [x9, #1]
 ; CHECK-NEXT:    fcsel s6, s1, s5, gt
 ; CHECK-NEXT:    fcmp s5, #0.0
-; CHECK-NEXT:    fcvtzs w13, s3
-; CHECK-NEXT:    fcsel s2, s0, s6, mi
+; CHECK-NEXT:    fcvtzs s3, s3
+; CHECK-NEXT:    fcsel s5, s0, s6, mi
 ; CHECK-NEXT:    subs w10, w10, #1
-; CHECK-NEXT:    strb w13, [x9, #2]
-; CHECK-NEXT:    fcvtzs w14, s2
-; CHECK-NEXT:    strb w14, [x9, #3]
+; CHECK-NEXT:    stur b3, [x9, #2]
+; CHECK-NEXT:    fcvtzs s5, s5
+; CHECK-NEXT:    stur b5, [x9, #3]
 ; CHECK-NEXT:    add x9, x9, #4
 ; CHECK-NEXT:    b.ne .LBB3_6
 ; CHECK-NEXT:  .LBB3_7: // %for.cond.cleanup
``````````

https://github.com/llvm/llvm-project/pull/151317
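For readers skimming the backport: the net effect of the combine, visible in the updated CHECK lines above, is that a scalar `fptosi`/`fptoui` whose only use is a store now stays in the FP/SIMD register file instead of bouncing through a GPR. A rough before/after sketch for the `f32_to_s32` case, in the same AArch64 assembly the tests use:

```
// Before the patch: the conversion result is forced into a GPR,
// paying for an FPR -> GPR transfer before the store.
fcvtzs  w8, s0        // f32 -> i32, result lands in GPR w8
str     w8, [x0]      // store from the GPR

// After the patch: a single-element NEON conversion keeps the
// result in s0, and the store reads the FP/SIMD register directly.
fcvtzs  s0, s0        // f32 -> i32 within the SIMD register file
str     s0, [x0]      // store straight from s0, no GPR transfer
```

For truncating stores, the NVCAST to a narrower vector element type is what lets the i8/i16 tests above store with `str b0`/`str h0` (or `st1 { v2.b }[0]` in the post-increment loops) rather than `strb`/`strh` from a GPR.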
