TiehuZhang updated this revision to Diff 427300.
TiehuZhang added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.
Fix the failing test case (optimization-remark-options.c): the expected remark
text in that test needed to be updated.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D122126/new/
https://reviews.llvm.org/D122126
Files:
clang/test/Frontend/optimization-remark-options.c
llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,90 @@
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize < %s -o - | FileCheck %s
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize -force-vector-interleave=2 < %s -o - | FileCheck -check-prefix=CHECK-INTERLEAVE %s
+
+; This loop is interleaved aggressively on PowerPC, which results in a large number of runtime memory checks.
+; (On the A2, loops are always unrolled aggressively. In fact, if aggressive interleaving is enabled,
+; similar issues may occur on other targets.)
+; Like the vectorization factor, interleaving should be restricted by the memory-check threshold
+; (e.g., runtime-memory-check-threshold, default 8).
+
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+
+; CHECK-INTERLEAVE-LABEL: @eddy_diff_caleddy_
+; CHECK-INTERLEAVE: vector.memcheck
+
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64 %0, i32 %ncol.cast.val) {
+entry:
+ %trip.count = add nuw i32 %ncol.cast.val, 1
+ %wide.trip.count = zext i32 %ncol.cast.val to i64
+ %1 = shl i64 %0, 1
+ %2 = mul i64 %0, 3
+ %3 = shl i64 %0, 2
+ %4 = mul i64 %0, 5
+ %5 = mul i64 %0, 6
+ %6 = mul i64 %0, 7
+ %7 = shl i64 %0, 3
+ %8 = mul i64 %0, 9
+ %9 = mul i64 %0, 10
+ %10 = mul i64 %0, 11
+ %11 = mul i64 %0, 12
+ br label %loop.body
+
+loop.body:
+ %indvars.iv774 = phi i64 [ 0, %entry ], [ %indvars.iv.next775, %loop.body ]
+ %12 = add nsw i64 %indvars.iv774, -5
+ %13 = add i64 %12, %0
+ %14 = getelementptr i64, i64* %wet_cl, i64 %13
+ %15 = bitcast i64* %14 to double*
+ store double 0.000000e+00, double* %15, align 8
+ %16 = add i64 %12, %1
+ %17 = getelementptr i64, i64* %wet_cl, i64 %16
+ %18 = bitcast i64* %17 to double*
+ store double 0.000000e+00, double* %18, align 8
+ %19 = add i64 %12, %2
+ %20 = getelementptr i64, i64* %wet_cl, i64 %19
+ %21 = bitcast i64* %20 to double*
+ store double 0.000000e+00, double* %21, align 8
+ %22 = add i64 %12, %3
+ %23 = getelementptr i64, i64* %wet_cl, i64 %22
+ %24 = bitcast i64* %23 to double*
+ store double 0.000000e+00, double* %24, align 8
+ %25 = add i64 %12, %4
+ %26 = getelementptr i64, i64* %wet_cl, i64 %25
+ %27 = bitcast i64* %26 to double*
+ store double 0.000000e+00, double* %27, align 8
+ %28 = add i64 %12, %5
+ %29 = getelementptr i64, i64* %wet_cl, i64 %28
+ %30 = bitcast i64* %29 to double*
+ store double 0.000000e+00, double* %30, align 8
+ %31 = add i64 %12, %6
+ %32 = getelementptr i64, i64* %wet_cl, i64 %31
+ %33 = bitcast i64* %32 to double*
+ store double 0.000000e+00, double* %33, align 8
+ %34 = add i64 %12, %7
+ %35 = getelementptr i64, i64* %wet_cl, i64 %34
+ %36 = bitcast i64* %35 to double*
+ store double 0.000000e+00, double* %36, align 8
+ %37 = add i64 %12, %8
+ %38 = getelementptr i64, i64* %wet_cl, i64 %37
+ %39 = bitcast i64* %38 to double*
+ store double 0.000000e+00, double* %39, align 8
+ %40 = add i64 %12, %9
+ %41 = getelementptr i64, i64* %wet_cl, i64 %40
+ %42 = bitcast i64* %41 to double*
+ store double 0.000000e+00, double* %42, align 8
+ %43 = add i64 %12, %10
+ %44 = getelementptr i64, i64* %wet_cl, i64 %43
+ %45 = bitcast i64* %44 to double*
+ store double 0.000000e+00, double* %45, align 8
+ %46 = add i64 %12, %11
+ %47 = getelementptr i64, i64* %wet_cl, i64 %46
+ %48 = bitcast i64* %47 to double*
+ store double 0.000000e+00, double* %48, align 8
+ %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+ %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+ br i1 %exitcond778.not, label %loop.end, label %loop.body
+
+loop.end:
+ ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7450,6 +7450,14 @@
return VectorizationFactor::Disabled();
}
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() {
+ unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+ return (NumRuntimePointerChecks >
+ VectorizerParams::RuntimeMemoryCheckThreshold &&
+ !Hints.allowReordering()) ||
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7518,30 +7526,7 @@
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.
- auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
-
- // Check if it is profitable to vectorize with runtime checks.
- unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
- if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysisAliasing(
- DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
- OrigLoop->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Hints.emitRemarkWithHints();
- return VectorizationFactor::Disabled();
- }
- }
- return SelectedVF;
+ return CM.selectVectorizationFactor(VFCandidates);
}
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
@@ -10450,9 +10435,21 @@
unsigned IC = 1;
if (MaybeVF) {
- VF = *MaybeVF;
- // Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+ if (UserVF || UserIC || !LVP.requiresTooManyRuntimeChecks()) {
+ VF = *MaybeVF;
+ // Select the interleave count.
+ IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+ } else {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(
+ DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Hints.emitRemarkWithHints();
+ }
}
// Identify the diagnostic messages that should be produced.
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -320,6 +320,9 @@
getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
VFRange &Range);
+ /// Check if the number of runtime checks exceeds the threshold.
+ bool requiresTooManyRuntimeChecks();
+
protected:
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
Index: clang/test/Frontend/optimization-remark-options.c
===================================================================
--- clang/test/Frontend/optimization-remark-options.c
+++ clang/test/Frontend/optimization-remark-options.c
@@ -12,7 +12,7 @@
return v;
}
-// CHECK: {{.*}}:18:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
+// CHECK: {{.*}}:18:3: remark: the cost-model indicates that interleaving is not beneficial and is explicitly disabled or interleave count is set to 1
void foo2(int *dw, int *uw, int *A, int *B, int *C, int *D, int N) {
for (long i = 0; i < N; i++) {
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits