Author: Luke Lau Date: 2025-06-13T18:25:07+01:00 New Revision: b81d5e06c7cba8c9f1f5380daed4b9ee139214ba
URL: https://github.com/llvm/llvm-project/commit/b81d5e06c7cba8c9f1f5380daed4b9ee139214ba DIFF: https://github.com/llvm/llvm-project/commit/b81d5e06c7cba8c9f1f5380daed4b9ee139214ba.diff LOG: [InstCombine] Fold shuffles through all trivially vectorizable intrinsics (#141979) This addresses a TODO in foldShuffledIntrinsicOperands to use isTriviallyVectorizable instead of a hardcoded list of intrinsics, which in turn allows more intriniscs to be scalarized by VectorCombine. >From what I can tell every intrinsic here should be speculatable so an assertion was added. Because this enables intrinsics like abs which have a scalar operand, we need to also check isVectorIntrinsicWithScalarOpAtArg. Added: Modified: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp llvm/test/Transforms/InstCombine/abs-1.ll llvm/test/Transforms/InstCombine/fma.ll llvm/test/Transforms/InstCombine/minmax-intrinsics.ll llvm/test/Transforms/InstCombine/powi.ll llvm/test/Transforms/InstCombine/scmp.ll llvm/test/Transforms/InstCombine/sqrt.ll llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll Removed: ################################################################################ diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c169ab25b2106..8c8cc0859e4af 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1400,42 +1400,46 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { /// try to shuffle after the intrinsic. Instruction * InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) { - // TODO: This should be extended to handle other intrinsics like fshl, ctpop, - // etc. Use llvm::isTriviallyVectorizable() and related to determine - // which intrinsics are safe to shuffle? - switch (II->getIntrinsicID()) { - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::fma: - case Intrinsic::fshl: - case Intrinsic::fshr: - break; - default: + if (!isTriviallyVectorizable(II->getIntrinsicID()) || + !II->getCalledFunction()->isSpeculatable()) + return nullptr; + + // fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so + // avoid undoing it. + if (match(II, m_FAbs(m_Value()))) return nullptr; - } Value *X; Constant *C; ArrayRef<int> Mask; - auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>); + auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) { + return isa<Constant>(Arg.get()) || + isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), + Arg.getOperandNo(), nullptr); + }); if (!NonConstArg || !match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask)))) return nullptr; - // At least 1 operand must have 1 use because we are creating 2 instructions. - if (none_of(II->args(), [](Value *V) { return V->hasOneUse(); })) + // At least 1 operand must be a shuffle with 1 use because we are creating 2 + // instructions. + if (none_of(II->args(), [](Value *V) { + return isa<ShuffleVectorInst>(V) && V->hasOneUse(); + })) return nullptr; // See if all arguments are shuffled with the same mask. SmallVector<Value *, 4> NewArgs; Type *SrcTy = X->getType(); - for (Value *Arg : II->args()) { - if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) && - X->getType() == SrcTy) + for (Use &Arg : II->args()) { + if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), + Arg.getOperandNo(), nullptr)) + NewArgs.push_back(Arg); + else if (match(&Arg, + m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) && + X->getType() == SrcTy) NewArgs.push_back(X); - else if (match(Arg, m_ImmConstant(C))) { + else if (match(&Arg, m_ImmConstant(C))) { // If it's a constant, try find the constant that would be shuffled to C. if (Constant *ShuffledC = unshuffleConstant(Mask, C, cast<VectorType>(SrcTy))) @@ -1448,8 +1452,12 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) { // intrinsic (shuf X, M), (shuf Y, M), ... --> shuf (intrinsic X, Y, ...), M Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr; + // Result type might be a diff erent vector width. + // TODO: Check that the result type isn't widened? + VectorType *ResTy = + VectorType::get(II->getType()->getScalarType(), cast<VectorType>(SrcTy)); Value *NewIntrinsic = - Builder.CreateIntrinsic(II->getIntrinsicID(), SrcTy, NewArgs, FPI); + Builder.CreateIntrinsic(ResTy, II->getIntrinsicID(), NewArgs, FPI); return new ShuffleVectorInst(NewIntrinsic, Mask); } diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll index 7037647d116ba..fd67fc3421498 100644 --- a/llvm/test/Transforms/InstCombine/abs-1.ll +++ b/llvm/test/Transforms/InstCombine/abs-1.ll @@ -978,3 +978,14 @@ define i32 @abs_ diff _signed_slt_no_nsw_swap(i32 %a, i32 %b) { %cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab ret i32 %cond } + +define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) { +; CHECK-LABEL: @abs_unary_shuffle_ops( +; CHECK-NEXT: [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false) +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0> +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0> + %r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false) + ret <2 x i32> %r +} diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll index f0d4f776a5d90..e3d3e722bcc23 100644 --- a/llvm/test/Transforms/InstCombine/fma.ll +++ b/llvm/test/Transforms/InstCombine/fma.ll @@ -972,6 +972,19 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) { ret <2 x half> %sub } +define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) { +; CHECK-LABEL: @fmuladd_unary_shuffle_ops( +; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0> +; CHECK-NEXT: ret <2 x float> [[R1]] +; + %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0> + %b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0> + %c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0> + %r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) + ret <2 x float> %r +} + ; negative tests define half @fma_non_negone(half %x, half %y) { diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index 38930956eda2f..52bc3636be359 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2511,6 +2511,21 @@ define <3 x i8> @smin_unary_shuffle_ops_uses(<3 x i8> %x, <3 x i8> %y) { ret <3 x i8> %r } +; negative test - too many uses + +define <3 x i8> @smin_unary_shuffle_ops_uses_const(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @smin_unary_shuffle_ops_uses_const( +; CHECK-NEXT: [[SX:%.*]] = shufflevector <3 x i8> [[X:%.*]], <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2> +; CHECK-NEXT: call void @use_vec(<3 x i8> [[SX]]) +; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.smin.v3i8(<3 x i8> [[SX]], <3 x i8> <i8 1, i8 2, i8 3>) +; CHECK-NEXT: ret <3 x i8> [[R]] +; + %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2> + call void @use_vec(<3 x i8> %sx) + %r = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %sx, <3 x i8> <i8 1, i8 2, i8 3>) + ret <3 x i8> %r +} + ; This would assert/crash because we tried to zext to i1. @g = external dso_local global i32, align 4 diff --git a/llvm/test/Transforms/InstCombine/powi.ll b/llvm/test/Transforms/InstCombine/powi.ll index d76f92c1849af..422792a5a2c28 100644 --- a/llvm/test/Transforms/InstCombine/powi.ll +++ b/llvm/test/Transforms/InstCombine/powi.ll @@ -564,3 +564,29 @@ define double @powi_fmul_powi_x_overflow(double noundef %x) { %mul = fmul reassoc double %p1, %x ret double %mul } + +define <3 x float> @powi_unary_shuffle_ops(<3 x float> %x, i32 %power) { +; CHECK-LABEL: @powi_unary_shuffle_ops( +; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[X:%.*]], i32 [[POWER:%.*]]) +; CHECK-NEXT: [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2> +; CHECK-NEXT: ret <3 x float> [[R]] +; + %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2> + %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power) + ret <3 x float> %r +} + +; Negative test - multiple uses + +define <3 x float> @powi_unary_shuffle_ops_use(<3 x float> %x, i32 %power, ptr %p) { +; CHECK-LABEL: @powi_unary_shuffle_ops_use( +; CHECK-NEXT: [[SX:%.*]] = shufflevector <3 x float> [[X:%.*]], <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2> +; CHECK-NEXT: store <3 x float> [[SX]], ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = call <3 x float> @llvm.powi.v3f32.i32(<3 x float> [[SX]], i32 [[POWER:%.*]]) +; CHECK-NEXT: ret <3 x float> [[R]] +; + %sx = shufflevector <3 x float> %x, <3 x float> poison, <3 x i32> <i32 1, i32 0, i32 2> + store <3 x float> %sx, ptr %p + %r = call <3 x float> @llvm.powi(<3 x float> %sx, i32 %power) + ret <3 x float> %r +} diff --git a/llvm/test/Transforms/InstCombine/scmp.ll b/llvm/test/Transforms/InstCombine/scmp.ll index 2140a59de3fa9..2bf22aeb7a6e9 100644 --- a/llvm/test/Transforms/InstCombine/scmp.ll +++ b/llvm/test/Transforms/InstCombine/scmp.ll @@ -423,6 +423,19 @@ define i8 @scmp_from_select_eq_and_gt_commuted3(i32 %x, i32 %y) { ret i8 %r } +define <3 x i2> @scmp_unary_shuffle_ops(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: define <3 x i2> @scmp_unary_shuffle_ops( +; CHECK-SAME: <3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i2> @llvm.scmp.v3i2.v3i8(<3 x i8> [[X]], <3 x i8> [[Y]]) +; CHECK-NEXT: [[R:%.*]] = shufflevector <3 x i2> [[TMP1]], <3 x i2> poison, <3 x i32> <i32 1, i32 0, i32 2> +; CHECK-NEXT: ret <3 x i2> [[R]] +; + %sx = shufflevector <3 x i8> %x, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2> + %sy = shufflevector <3 x i8> %y, <3 x i8> poison, <3 x i32> <i32 1, i32 0, i32 2> + %r = call <3 x i2> @llvm.scmp(<3 x i8> %sx, <3 x i8> %sy) + ret <3 x i2> %r +} + ; Negative test: true value of outer select is not zero define i8 @scmp_from_select_eq_and_gt_neg1(i32 %x, i32 %y) { ; CHECK-LABEL: define i8 @scmp_from_select_eq_and_gt_neg1( diff --git a/llvm/test/Transforms/InstCombine/sqrt.ll b/llvm/test/Transforms/InstCombine/sqrt.ll index 0f4db3b3a65ae..2fda5bc37d023 100644 --- a/llvm/test/Transforms/InstCombine/sqrt.ll +++ b/llvm/test/Transforms/InstCombine/sqrt.ll @@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) { ret <2 x float> %res } +define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) { +; CHECK-LABEL: @sqrt_unary_shuffle_ops( +; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]]) +; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0> +; CHECK-NEXT: ret <2 x float> [[R1]] +; + %a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0> + %r = call <2 x float> @llvm.sqrt(<2 x float> %a) + ret <2 x float> %r +} + declare i32 @foo(double) declare double @sqrt(double) readnone declare float @sqrtf(float) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll index 3749bdf1bba39..783a1e83c6724 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat-inseltpoison.ll @@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison> ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; @@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> +; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> -; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> +; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> -; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll index 0bb641371825b..7e31ec9a0b39a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -264,11 +264,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX8-NEXT: bb: ; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison> ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; @@ -276,11 +273,8 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; GFX9-NEXT: bb: ; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 ; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) ; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison> ; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2 ; GFX9-NEXT: ret <3 x i16> [[INS_2]] ; @@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> +; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> -; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( ; GFX9-NEXT: bb: -; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1> -; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) -; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> +; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]]) +; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]]) ; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3> -; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]]) -; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> +; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; GFX9-NEXT: ret <4 x i16> [[INS_31]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll index b790e6f3c99c6..77d36f0107665 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SLM-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> @@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; AVX-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]]) +; AVX-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; AVX-NEXT: [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> @@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX2-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; AVX2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; AVX2-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; AVX2-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; AVX2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; AVX2-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; AVX2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; AVX2-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll index ef1a67032c237..18d79752b0b44 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -38,13 +38,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; SLM-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; SLM-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; SLM-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; SLM-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; SLM-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; SLM-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; SLM-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> @@ -59,13 +59,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; AVX-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP8]]) +; AVX-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; AVX-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; AVX-NEXT: [[R2:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> @@ -80,13 +80,13 @@ define <8 x float> @ceil_floor(<8 x float> %a) { ; AVX2-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i64 0 ; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i64 3 ; AVX2-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 1, i32 2> -; AVX2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) +; AVX2-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2> ; AVX2-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 4, i32 5> -; AVX2-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> poison, <2 x i32> <i32 6, i32 7> -; AVX2-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <2 x i32> <i32 4, i32 5> +; AVX2-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A]]) +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <2 x i32> <i32 6, i32 7> ; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i64 0 ; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> ; AVX2-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits