================
@@ -253,38 +253,38 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-MAXBW-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-MAXBW-NEXT: entry:
 ; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
 ; CHECK-MAXBW-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK-MAXBW: vector.ph:
 ; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
 ; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
+; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
 ; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
 ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
 ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-MAXBW: vector.body:
 ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP8]]
 ; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-MAXBW-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
 ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1
-; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 0
-; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP12]], align 1
-; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i64>
-; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 8 x i64> [[TMP13]], [[TMP11]]
-; CHECK-MAXBW-NEXT: [[TMP15]] = add <vscale x 8 x i64> [[TMP14]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP15]], [[TMP13]]
+; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP14]])
----------------

SamTebbs33 wrote:

Ah, it looks like the cost that previously stopped the vectorizer from choosing a 16i8 -> 2i64 partial reduction is no longer high enough now that the extend cost is hidden. I've made this permutation invalid.

https://github.com/llvm/llvm-project/pull/136173
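
To make the "made this permutation invalid" part concrete: the usual way for a cost hook to veto a particular input/accumulator combination is to report an invalid cost for it, so the vectorizer can never select that plan. The sketch below only illustrates that idea; the function name and parameters are assumptions for this example, not the code from the PR (the real hook lives in the target's TargetTransformInfo implementation and checks more conditions).

// Illustrative sketch only: a hypothetical cost-hook-style helper that
// rejects the i8 -> i64 partial-reduction permutation by returning an
// invalid cost instead of a finite one.
#include "llvm/IR/Type.h"
#include "llvm/Support/InstructionCost.h"

using namespace llvm;

InstructionCost getPartialReductionCostSketch(Type *InputEltTy,
                                              Type *AccumEltTy) {
  // The case from this test: <vscale x 16 x i8> inputs accumulated into a
  // <vscale x 2 x i64> phi. An invalid cost means the planner cannot pick
  // this partial reduction and falls back to the plain zext/mul/add
  // reduction instead.
  if (InputEltTy->isIntegerTy(8) && AccumEltTy->isIntegerTy(64))
    return InstructionCost::getInvalid();

  // Any other combination gets some finite, placeholder cost here.
  return InstructionCost(1);
}

The key property is that an invalid InstructionCost is never ordered below a valid one, so the cost comparison simply cannot choose this shape.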