https://github.com/nikic created https://github.com/llvm/llvm-project/pull/144322
Backport of https://github.com/llvm/llvm-project/pull/143020 for https://github.com/llvm/llvm-project/issues/139050.

>From 9792f981063d6ddadd3678ac31e2254daa6aa9cf Mon Sep 17 00:00:00 2001
From: Nikita Popov <npo...@redhat.com>
Date: Fri, 6 Jun 2025 17:50:16 +0200
Subject: [PATCH 1/2] [PhaseOrdering] Add test for #139050 (NFC)

(cherry picked from commit cef5a3155bab9a2db5389f782471d56f1dd15b61)
---
 .../PhaseOrdering/X86/vector-reductions.ll | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 254136b0b841a..f8450766037b2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -325,3 +325,53 @@ cleanup:
   %retval.0 = phi i1 [ false, %if.then ], [ true, %if.end ]
   ret i1 %retval.0
 }
+
+; From https://github.com/llvm/llvm-project/issues/139050.
+; FIXME: This should be vectorized.
+define i8 @masked_min_reduction(ptr %data, ptr %mask) {
+; CHECK-LABEL: @masked_min_reduction(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DATA:%.*]] = getelementptr i8, ptr [[DATA1:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[MASK:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[M:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[M]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[VAL]])
+; CHECK-NEXT:    [[TMP21]] = select i1 [[COND]], i8 [[TMP0]], i8 [[ACC]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP20]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i8 [[TMP21]]
+;
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %next, %loop ]
+  %acc = phi i8 [ 255, %entry ], [ %acc_next, %loop ]
+
+  %ptr_i = getelementptr i8, ptr %data, i64 %i
+  %val = load i8, ptr %ptr_i, align 1
+
+  %mask_ptr = getelementptr i8, ptr %mask, i64 %i
+  %m = load i8, ptr %mask_ptr, align 1
+  %cond = icmp eq i8 %m, 0
+
+  ; Use select to implement masking
+  %masked_val = select i1 %cond, i8 %val, i8 255
+
+  ; min reduction
+  %acc_next = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
+
+  %next = add i64 %i, 1
+  %cmp = icmp ult i64 %next, 1024
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i8 %acc_next
+}

>From 3d02b33c4b791b1e19bfdc5a7a6d372fa6e527ac Mon Sep 17 00:00:00 2001
From: Konstantin Bogdanov <thevar1a...@users.noreply.github.com>
Date: Sat, 14 Jun 2025 09:32:54 +0300
Subject: [PATCH 2/2] [InstCombine] Avoid folding `select(umin(X, Y), X)` with min/max values in false arm (#143020)

Fixes https://github.com/llvm/llvm-project/issues/139050.

This patch adds a check to avoid folding a min/max reduction into a select, since that fold may block loop vectorization later.
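A quick way to observe the InstCombine-only behavior described below is to run just the instcombine pass on the reproducer; a minimal invocation, assuming the first snippet is saved as `repro.ll` (an illustrative file name), would be:

```
# Run only InstCombine and print the resulting IR; before this change the
# select and umin trade places, as shown in the "is optimized to" snippet below.
opt -passes=instcombine -S repro.ll
```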
The issue is that the following snippet:

```
declare i8 @llvm.umin.i8(i8, i8)

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
; CHECK-LABEL: @masked_min_fold_bug(
; CHECK: %cond = icmp eq i8 %mask, 0
; CHECK: %masked_val = select i1 %cond, i8 %val, i8 255
; CHECK: call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
;
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 255
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}
```

is optimized to the following code, which cannot be vectorized later:

```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %1 = call i8 @llvm.umin.i8(i8 %acc, i8 %val)
  %res = select i1 %cond, i8 %1, i8 %acc
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

Expected:

```
declare i8 @llvm.umin.i8(i8, i8) #0

define i8 @masked_min_fold_bug(i8 %acc, i8 %val, i8 %mask) {
  %cond = icmp eq i8 %mask, 0
  %masked_val = select i1 %cond, i8 %val, i8 -1
  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
  ret i8 %res
}

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
```

https://godbolt.org/z/cYMheKE5r

(cherry picked from commit 07fa6d1d90c714fa269529c3e5004a063d814c4a)
---
 .../InstCombine/InstructionCombining.cpp | 9 ++++
 llvm/test/Transforms/InstCombine/select.ll | 47 +++++++++++++++++
 .../PhaseOrdering/X86/vector-reductions.ll | 50 ++++++++++++++-----
 3 files changed, 94 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index a64c188575e6c..0f5e867877da2 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1697,6 +1697,15 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
   if (SI->getType()->isIntOrIntVectorTy(1))
     return nullptr;
 
+  // Avoid breaking min/max reduction pattern,
+  // which is necessary for vectorization later.
+  if (isa<MinMaxIntrinsic>(&Op))
+    for (Value *IntrinOp : Op.operands())
+      if (auto *PN = dyn_cast<PHINode>(IntrinOp))
+        for (Value *PhiOp : PN->operands())
+          if (PhiOp == &Op)
+            return nullptr;
+
   // Test if a FCmpInst instruction is used exclusively by a select as
   // part of a minimum or maximum operation. If so, refrain from doing
   // any other folding. This helps out other analyses which understand
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll
index 3c3111492fc68..e8a32cb1697a5 100644
--- a/llvm/test/Transforms/InstCombine/select.ll
+++ b/llvm/test/Transforms/InstCombine/select.ll
@@ -4901,3 +4901,50 @@ define i32 @src_simplify_2x_at_once_and(i32 %x, i32 %y) {
   %cond = select i1 %and0, i32 %sub, i32 %xor
   ret i32 %cond
 }
+
+define void @no_fold_masked_min_loop(ptr nocapture readonly %vals, ptr nocapture readonly %masks, ptr nocapture %out, i64 %n) {
+; CHECK-LABEL: @no_fold_masked_min_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[NEXT_INDEX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[RES:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[VAL_PTR:%.*]] = getelementptr inbounds i8, ptr [[VALS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[MASK_PTR:%.*]] = getelementptr inbounds i8, ptr [[MASKS:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[VAL_PTR]], align 1
+; CHECK-NEXT:    [[MASK:%.*]] = load i8, ptr [[MASK_PTR]], align 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[MASK]], 0
+; CHECK-NEXT:    [[MASKED_VAL:%.*]] = select i1 [[COND]], i8 [[VAL]], i8 -1
+; CHECK-NEXT:    [[RES]] = call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[MASKED_VAL]])
+; CHECK-NEXT:    [[NEXT_INDEX]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXT_INDEX]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK:       exit:
+; CHECK-NEXT:    store i8 [[RES]], ptr [[OUT:%.*]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %index = phi i64 [0, %entry], [%next_index, %loop]
+  %acc = phi i8 [255, %entry], [%res, %loop]
+
+  %val_ptr = getelementptr inbounds i8, ptr %vals, i64 %index
+  %mask_ptr = getelementptr inbounds i8, ptr %masks, i64 %index
+
+  %val = load i8, ptr %val_ptr, align 1
+  %mask = load i8, ptr %mask_ptr, align 1
+
+  %cond = icmp eq i8 %mask, 0
+  %masked_val = select i1 %cond, i8 %val, i8 -1
+  %res = call i8 @llvm.umin.i8(i8 %acc, i8 %masked_val)
+
+  %next_index = add i64 %index, 1
+  %done = icmp eq i64 %next_index, %n
+  br i1 %done, label %exit, label %loop
+
+exit:
+  store i8 %res, ptr %out, align 1
+  ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index f8450766037b2..2ec48a8637dae 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -326,26 +326,52 @@ cleanup:
   ret i1 %retval.0
 }
 
-; From https://github.com/llvm/llvm-project/issues/139050.
-; FIXME: This should be vectorized.
 define i8 @masked_min_reduction(ptr %data, ptr %mask) {
 ; CHECK-LABEL: @masked_min_reduction(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       loop:
+; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACC:%.*]] = phi i8 [ -1, [[ENTRY]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <32 x i8> [ splat (i8 -1), [[ENTRY]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[DATA:%.*]] = getelementptr i8, ptr [[DATA1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[DATA]], i64 32
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DATA]], i64 64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[DATA]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[DATA]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <32 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <32 x i8>, ptr [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[MASK:%.*]], i64 [[INDEX]]
-; CHECK-NEXT:    [[M:%.*]] = load i8, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[M]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = tail call i8 @llvm.umin.i8(i8 [[ACC]], i8 [[VAL]])
-; CHECK-NEXT:    [[TMP21]] = select i1 [[COND]], i8 [[TMP0]], i8 [[ACC]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <32 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD9]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq <32 x i8> [[WIDE_LOAD10]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP8]], <32 x i8> [[WIDE_LOAD]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP13:%.*]] = select <32 x i1> [[TMP9]], <32 x i8> [[WIDE_LOAD4]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP14:%.*]] = select <32 x i1> [[TMP10]], <32 x i8> [[WIDE_LOAD5]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP15:%.*]] = select <32 x i1> [[TMP11]], <32 x i8> [[WIDE_LOAD6]], <32 x i8> splat (i8 -1)
+; CHECK-NEXT:    [[TMP16]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI]], <32 x i8> [[TMP12]])
+; CHECK-NEXT:    [[TMP17]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI1]], <32 x i8> [[TMP13]])
+; CHECK-NEXT:    [[TMP18]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI2]], <32 x i8> [[TMP14]])
+; CHECK-NEXT:    [[TMP19]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[VEC_PHI3]], <32 x i8> [[TMP15]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP20]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
-; CHECK:       exit:
+; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[RDX_MINMAX:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP16]], <32 x i8> [[TMP17]])
+; CHECK-NEXT:    [[RDX_MINMAX11:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX]], <32 x i8> [[TMP18]])
+; CHECK-NEXT:    [[RDX_MINMAX12:%.*]] = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[RDX_MINMAX11]], <32 x i8> [[TMP19]])
+; CHECK-NEXT:    [[TMP21:%.*]] = tail call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[RDX_MINMAX12]])
 ; CHECK-NEXT:    ret i8 [[TMP21]]
 ;
 entry:

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
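The CHECK lines in the two tests updated above are of the auto-generated kind maintained with `llvm/utils/update_test_checks.py`; a typical way to regenerate and re-run them locally (assuming an LLVM build tree at `build/`, an illustrative path) is:

```
# Regenerate the auto-generated FileCheck assertions for the two updated tests.
llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
  llvm/test/Transforms/InstCombine/select.ll \
  llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll

# Run just these tests through lit to confirm the new vectorized checks pass.
build/bin/llvm-lit -v \
  llvm/test/Transforms/InstCombine/select.ll \
  llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
```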