[PATCH] D122126: [LoopVectorize] Don't interleave when the number of runtime checks exceeds the threshold

2022-05-05 Thread Tiehu Zhang via Phabricator via cfe-commits
TiehuZhang updated this revision to Diff 427300.
TiehuZhang added a comment.
Herald added a project: clang.
Herald added a subscriber: cfe-commits.

Fix the failing test case (optimization-remark-options.c): the expected remark text
needs to be updated to match the new diagnostic.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122126/new/

https://reviews.llvm.org/D122126

Files:
  clang/test/Frontend/optimization-remark-options.c
  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  
llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll

Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
===
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,90 @@
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize  < %s -o - | FileCheck %s
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize -force-vector-interleave=2 < %s -o - | FileCheck -check-prefix=CHECK-INTERLEAVE %s
+ 
+; This loop is interleaved aggressively on PowerPC (the A2 always unrolls aggressively),
+; resulting in a large number of runtime memory checks; similar issues may occur on
+; other targets when aggressive interleaving is enabled. Interleaving, like the
+; vectorization factor, should be limited by the runtime memory-check threshold
+; (runtime-memory-check-threshold, default 8).
+ 
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+ 
+; CHECK-INTERLEAVE-LABEL: @eddy_diff_caleddy_
+; CHECK-INTERLEAVE: vector.memcheck
+ 
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64 %0, i32 %ncol.cast.val) {
+entry:
+  %trip.count = add nuw i32 %ncol.cast.val, 1
+  %wide.trip.count = zext i32 %ncol.cast.val to i64
+  %1 = shl i64 %0, 1
+  %2 = mul i64 %0, 3
+  %3 = shl i64 %0, 2
+  %4 = mul i64 %0, 5
+  %5 = mul i64 %0, 6
+  %6 = mul i64 %0, 7
+  %7 = shl i64 %0, 3
+  %8 = mul i64 %0, 9
+  %9 = mul i64 %0, 10
+  %10 = mul i64 %0, 11
+  %11 = mul i64 %0, 12
+  br label %loop.body
+ 
+loop.body:
+  %indvars.iv774 = phi i64 [ 0, %entry ], [ %indvars.iv.next775, %loop.body ]
+  %12 = add nsw i64 %indvars.iv774, -5
+  %13 = add i64 %12, %0
+  %14 = getelementptr i64, i64* %wet_cl, i64 %13
+  %15 = bitcast i64* %14 to double*
+  store double 0.00e+00, double* %15, align 8
+  %16 = add i64 %12, %1
+  %17 = getelementptr i64, i64* %wet_cl, i64 %16
+  %18 = bitcast i64* %17 to double*
+  store double 0.00e+00, double* %18, align 8
+  %19 = add i64 %12, %2
+  %20 = getelementptr i64, i64* %wet_cl, i64 %19
+  %21 = bitcast i64* %20 to double*
+  store double 0.00e+00, double* %21, align 8
+  %22 = add i64 %12, %3
+  %23 = getelementptr i64, i64* %wet_cl, i64 %22
+  %24 = bitcast i64* %23 to double*
+  store double 0.00e+00, double* %24, align 8
+  %25 = add i64 %12, %4
+  %26 = getelementptr i64, i64* %wet_cl, i64 %25
+  %27 = bitcast i64* %26 to double*
+  store double 0.00e+00, double* %27, align 8
+  %28 = add i64 %12, %5
+  %29 = getelementptr i64, i64* %wet_cl, i64 %28
+  %30 = bitcast i64* %29 to double*
+  store double 0.00e+00, double* %30, align 8
+  %31 = add i64 %12, %6
+  %32 = getelementptr i64, i64* %wet_cl, i64 %31
+  %33 = bitcast i64* %32 to double*
+  store double 0.00e+00, double* %33, align 8
+  %34 = add i64 %12, %7
+  %35 = getelementptr i64, i64* %wet_cl, i64 %34
+  %36 = bitcast i64* %35 to double*
+  store double 0.00e+00, double* %36, align 8
+  %37 = add i64 %12, %8
+  %38 = getelementptr i64, i64* %wet_cl, i64 %37
+  %39 = bitcast i64* %38 to double*
+  store double 0.00e+00, double* %39, align 8
+  %40 = add i64 %12, %9
+  %41 = getelementptr i64, i64* %wet_cl, i64 %40
+  %42 = bitcast i64* %41 to double*
+  store double 0.00e+00, double* %42, align 8
+  %43 = add i64 %12, %10
+  %44 = getelementptr i64, i64* %wet_cl, i64 %43
+  %45 = bitcast i64* %44 to double*
+  store double 0.00e+00, double* %45, align 8
+  %46 = add i64 %12, %11
+  %47 = getelementptr i64, i64* %wet_cl, i64 %46
+  %48 = bitcast i64* %47 to double*
+  store double 0.00e+00, double* %48, align 8
+  %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+  %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+  br i1 %exitcond778.not, label %loop.end, label %loop.body
+ 
+loop.end:
+  ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7450,6 +7450,14 @@
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() {
+  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+  return (NumRuntimePointerChecks >
+              VectorizerParams::RuntimeMemoryCheckThreshold &&
+          !Hints.allowReordering()) ||
+         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}

[PATCH] D122126: [LoopVectorize] Don't interleave when the number of runtime checks exceeds the threshold

2022-05-12 Thread Tiehu Zhang via Phabricator via cfe-commits
TiehuZhang updated this revision to Diff 428930.
TiehuZhang added a comment.

Updated. Difference from the accepted version: the runtime memory-check handling is
moved into processLoop so that it limits both the VF and the IC.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122126/new/

https://reviews.llvm.org/D122126

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  
llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll

Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
===
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,86 @@
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize  < %s -o - | FileCheck %s
+ 
+; This loop is interleaved aggressively on PowerPC (the A2 always unrolls aggressively),
+; resulting in a large number of runtime memory checks; similar issues may occur on
+; other targets when aggressive interleaving is enabled. Interleaving, like the
+; vectorization factor, should be limited by the runtime memory-check threshold
+; (runtime-memory-check-threshold, default 8).
+ 
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+ 
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64 %0, i32 %ncol.cast.val) {
+entry:
+  %trip.count = add nuw i32 %ncol.cast.val, 1
+  %wide.trip.count = zext i32 %ncol.cast.val to i64
+  %1 = shl i64 %0, 1
+  %2 = mul i64 %0, 3
+  %3 = shl i64 %0, 2
+  %4 = mul i64 %0, 5
+  %5 = mul i64 %0, 6
+  %6 = mul i64 %0, 7
+  %7 = shl i64 %0, 3
+  %8 = mul i64 %0, 9
+  %9 = mul i64 %0, 10
+  %10 = mul i64 %0, 11
+  %11 = mul i64 %0, 12
+  br label %loop.body
+ 
+loop.body:
+  %indvars.iv774 = phi i64 [ 0, %entry ], [ %indvars.iv.next775, %loop.body ]
+  %12 = add nsw i64 %indvars.iv774, -5
+  %13 = add i64 %12, %0
+  %14 = getelementptr i64, i64* %wet_cl, i64 %13
+  %15 = bitcast i64* %14 to double*
+  store double 0.00e+00, double* %15, align 8
+  %16 = add i64 %12, %1
+  %17 = getelementptr i64, i64* %wet_cl, i64 %16
+  %18 = bitcast i64* %17 to double*
+  store double 0.00e+00, double* %18, align 8
+  %19 = add i64 %12, %2
+  %20 = getelementptr i64, i64* %wet_cl, i64 %19
+  %21 = bitcast i64* %20 to double*
+  store double 0.00e+00, double* %21, align 8
+  %22 = add i64 %12, %3
+  %23 = getelementptr i64, i64* %wet_cl, i64 %22
+  %24 = bitcast i64* %23 to double*
+  store double 0.00e+00, double* %24, align 8
+  %25 = add i64 %12, %4
+  %26 = getelementptr i64, i64* %wet_cl, i64 %25
+  %27 = bitcast i64* %26 to double*
+  store double 0.00e+00, double* %27, align 8
+  %28 = add i64 %12, %5
+  %29 = getelementptr i64, i64* %wet_cl, i64 %28
+  %30 = bitcast i64* %29 to double*
+  store double 0.00e+00, double* %30, align 8
+  %31 = add i64 %12, %6
+  %32 = getelementptr i64, i64* %wet_cl, i64 %31
+  %33 = bitcast i64* %32 to double*
+  store double 0.00e+00, double* %33, align 8
+  %34 = add i64 %12, %7
+  %35 = getelementptr i64, i64* %wet_cl, i64 %34
+  %36 = bitcast i64* %35 to double*
+  store double 0.00e+00, double* %36, align 8
+  %37 = add i64 %12, %8
+  %38 = getelementptr i64, i64* %wet_cl, i64 %37
+  %39 = bitcast i64* %38 to double*
+  store double 0.00e+00, double* %39, align 8
+  %40 = add i64 %12, %9
+  %41 = getelementptr i64, i64* %wet_cl, i64 %40
+  %42 = bitcast i64* %41 to double*
+  store double 0.00e+00, double* %42, align 8
+  %43 = add i64 %12, %10
+  %44 = getelementptr i64, i64* %wet_cl, i64 %43
+  %45 = bitcast i64* %44 to double*
+  store double 0.00e+00, double* %45, align 8
+  %46 = add i64 %12, %11
+  %47 = getelementptr i64, i64* %wet_cl, i64 %46
+  %48 = bitcast i64* %47 to double*
+  store double 0.00e+00, double* %48, align 8
+  %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+  %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+  br i1 %exitcond778.not, label %loop.end, label %loop.body
+ 
+loop.end:
+  ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7471,6 +7471,14 @@
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() {
+  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+  return (NumRuntimePointerChecks >
+              VectorizerParams::RuntimeMemoryCheckThreshold &&
+          !Hints.allowReordering()) ||
+         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7538,31 +7546,7 @@

[PATCH] D122126: [LoopVectorize] Don't interleave when the number of runtime checks exceeds the threshold

2022-05-12 Thread Tiehu Zhang via Phabricator via cfe-commits
TiehuZhang added a comment.

The code has been updated since it was accepted. Please review it again. Thank you
very much! @fhahn @dmgreen




Comment at: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp:10460
 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+if (!UserIC && requiresTooManyRtChecks) {
+  ORE->emit([&]() {

fhahn wrote:
> Can the handling be merged into a single check & diagnostic?
Hi @fhahn, thanks for your reply! Does the current version meet the requirements?



Comment at: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp:10460
 IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+if (!UserIC && requiresTooManyRtChecks) {
+  ORE->emit([&]() {

TiehuZhang wrote:
> fhahn wrote:
> > Can the handling be merged into a single check & diagnostic?
> Hi @fhahn, thanks for your reply! Does the current version meet the
> requirements?
Hi @fhahn, is there any other problem with this patch?

ping
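
For readers following the thread: the shape being asked for is a single planner query
that gates both the VF and the IC, paired with one diagnostic. Below is a minimal
sketch of that idea inside processLoop, assuming the requiresTooManyRuntimeChecks()
helper from this revision; the guard on user-forced VF/IC and the remark name and
message are placeholders for illustration, not the exact code in the patch.

```
// Sketch only: one check covering both vectorization and interleaving.
// The remark name/text and the user-forced VF/IC guard are assumptions.
if (UserVF.isZero() && !UserIC && LVP.requiresTooManyRuntimeChecks()) {
  ORE->emit([&]() {
    return OptimizationRemarkAnalysisAliasing(
               LV_NAME, "TooManyRuntimeChecks", L->getStartLoc(),
               L->getHeader())
           << "loop not vectorized or interleaved: the number of runtime "
              "pointer checks exceeds the threshold";
  });
  VF = VectorizationFactor::Disabled();
  IC = 1;
} else {
  // Otherwise select the interleave count as before.
  IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
}
```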



Comment at: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp:10461
+if (!LVP.hasTooManyRuntimeChecks()) {
+  IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
+}

fhahn wrote:
> This check here should be sufficient, there should be no need to also check 
> in `selectInterleaveCount`.
> 
> Could you just move the remark generation & early exit from `::plan` here?
> 
> You might want to skip those checks if there's a UserVF or UserIC used, with 
> those I think we should always vectorize if possible. It also might be good 
> to add a check line to your test which forces an interleave count > 1.
Hi @fhahn, thanks for your review! This sounds similar to doesNotMeet
(https://reviews.llvm.org/D98634); the difference is that I need UserIC and UserVF to
control whether this check is performed, right? E.g.:

```
if (!UserVF && LVP.requiresTooManyRuntimeChecks()) {
  /*generate remarks*/
  VF = VectorizationFactor::Disabled();
}

if (!UserIC && LVP.requiresTooManyRuntimeChecks()) {
  /*generate remarks*/
  IC = 1;
}
```








Comment at: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp:7673-7674
   // Check if it is profitable to vectorize with runtime checks.
-  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
-  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
-    bool PragmaThresholdReached =
-        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
-    bool ThresholdReached =
-        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
-    if ((ThresholdReached && !Hints.allowReordering()) ||
-        PragmaThresholdReached) {
+  if (SelectedVF.Width.getKnownMinValue() > 1) {
+    if (hasTooManyRuntimeChecks()) {
       ORE->emit([&]() {

dmgreen wrote:
> Maybe just use a single if now:
> `if (SelectedVF.Width.getKnownMinValue() > 1 && hasTooManyRuntimeChecks()) {`
Done. Thanks for your review!
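
For context, the merged single-if form would read roughly as follows in the planner.
The remark name and wording are reconstructed from the pre-patch code quoted above,
so treat them as approximate rather than verbatim from this revision.

```
  // Sketch of the merged check: one condition, one diagnostic.
  if (SelectedVF.Width.getKnownMinValue() > 1 && hasTooManyRuntimeChecks()) {
    ORE->emit([&]() {
      return OptimizationRemarkAnalysisAliasing(
                 DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                 OrigLoop->getHeader())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "memory operations";
    });
    LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
    return VectorizationFactor::Disabled();
  }
```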


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122126/new/

https://reviews.llvm.org/D122126

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D122126: [LoopVectorize] Don't interleave when the number of runtime checks exceeds the threshold

2022-05-12 Thread Tiehu Zhang via Phabricator via cfe-commits
TiehuZhang updated this revision to Diff 429110.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122126/new/

https://reviews.llvm.org/D122126

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  
llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll

Index: llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
===
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/PowerPC/interleaved-pointer-runtime-check-unprofitable.ll
@@ -0,0 +1,86 @@
+; RUN: opt -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -S -loop-vectorize  < %s -o - | FileCheck %s
+ 
+; This loop is interleaved aggressively on PowerPC (the A2 always unrolls aggressively),
+; resulting in a large number of runtime memory checks; similar issues may occur on
+; other targets when aggressive interleaving is enabled. Interleaving, like the
+; vectorization factor, should be limited by the runtime memory-check threshold
+; (runtime-memory-check-threshold, default 8).
+ 
+; CHECK-LABEL: @eddy_diff_caleddy_
+; CHECK-NOT: vector.memcheck
+ 
+define fastcc void @eddy_diff_caleddy_(i64* %wet_cl, i64 %0, i32 %ncol.cast.val) {
+entry:
+  %trip.count = add nuw i32 %ncol.cast.val, 1
+  %wide.trip.count = zext i32 %ncol.cast.val to i64
+  %1 = shl i64 %0, 1
+  %2 = mul i64 %0, 3
+  %3 = shl i64 %0, 2
+  %4 = mul i64 %0, 5
+  %5 = mul i64 %0, 6
+  %6 = mul i64 %0, 7
+  %7 = shl i64 %0, 3
+  %8 = mul i64 %0, 9
+  %9 = mul i64 %0, 10
+  %10 = mul i64 %0, 11
+  %11 = mul i64 %0, 12
+  br label %loop.body
+ 
+loop.body:
+  %indvars.iv774 = phi i64 [ 0, %entry ], [ %indvars.iv.next775, %loop.body ]
+  %12 = add nsw i64 %indvars.iv774, -5
+  %13 = add i64 %12, %0
+  %14 = getelementptr i64, i64* %wet_cl, i64 %13
+  %15 = bitcast i64* %14 to double*
+  store double 0.00e+00, double* %15, align 8
+  %16 = add i64 %12, %1
+  %17 = getelementptr i64, i64* %wet_cl, i64 %16
+  %18 = bitcast i64* %17 to double*
+  store double 0.00e+00, double* %18, align 8
+  %19 = add i64 %12, %2
+  %20 = getelementptr i64, i64* %wet_cl, i64 %19
+  %21 = bitcast i64* %20 to double*
+  store double 0.00e+00, double* %21, align 8
+  %22 = add i64 %12, %3
+  %23 = getelementptr i64, i64* %wet_cl, i64 %22
+  %24 = bitcast i64* %23 to double*
+  store double 0.00e+00, double* %24, align 8
+  %25 = add i64 %12, %4
+  %26 = getelementptr i64, i64* %wet_cl, i64 %25
+  %27 = bitcast i64* %26 to double*
+  store double 0.00e+00, double* %27, align 8
+  %28 = add i64 %12, %5
+  %29 = getelementptr i64, i64* %wet_cl, i64 %28
+  %30 = bitcast i64* %29 to double*
+  store double 0.00e+00, double* %30, align 8
+  %31 = add i64 %12, %6
+  %32 = getelementptr i64, i64* %wet_cl, i64 %31
+  %33 = bitcast i64* %32 to double*
+  store double 0.00e+00, double* %33, align 8
+  %34 = add i64 %12, %7
+  %35 = getelementptr i64, i64* %wet_cl, i64 %34
+  %36 = bitcast i64* %35 to double*
+  store double 0.00e+00, double* %36, align 8
+  %37 = add i64 %12, %8
+  %38 = getelementptr i64, i64* %wet_cl, i64 %37
+  %39 = bitcast i64* %38 to double*
+  store double 0.00e+00, double* %39, align 8
+  %40 = add i64 %12, %9
+  %41 = getelementptr i64, i64* %wet_cl, i64 %40
+  %42 = bitcast i64* %41 to double*
+  store double 0.00e+00, double* %42, align 8
+  %43 = add i64 %12, %10
+  %44 = getelementptr i64, i64* %wet_cl, i64 %43
+  %45 = bitcast i64* %44 to double*
+  store double 0.00e+00, double* %45, align 8
+  %46 = add i64 %12, %11
+  %47 = getelementptr i64, i64* %wet_cl, i64 %46
+  %48 = bitcast i64* %47 to double*
+  store double 0.00e+00, double* %48, align 8
+  %indvars.iv.next775 = add nuw nsw i64 %indvars.iv774, 1
+  %exitcond778.not = icmp eq i64 %indvars.iv.next775, %wide.trip.count
+  br i1 %exitcond778.not, label %loop.end, label %loop.body
+ 
+loop.end:
+  ret void
+}
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7471,6 +7471,14 @@
   return VectorizationFactor::Disabled();
 }
 
+bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() {
+  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
+  return (NumRuntimePointerChecks >
+              VectorizerParams::RuntimeMemoryCheckThreshold &&
+          !Hints.allowReordering()) ||
+         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+}
+
Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
   assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -7538,31 +7546,7 @@
   if (!MaxFactors.hasVector())
 return VectorizationFactor::Disabled();
 
-  // Select the optimal vectorization factor.
-  auto SelectedV

[PATCH] D122126: [LoopVectorize] Don't interleave when the number of runtime checks exceeds the threshold

2022-05-16 Thread Tiehu Zhang via Phabricator via cfe-commits
TiehuZhang added a comment.

In D122126#3515070, @fhahn wrote:

> Still LGTM, thanks! The remaining suggestion can be addressed directly before 
> committing the patch.

Thanks, @fhahn! I'll add the precommit test when committing the patch.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D122126/new/

https://reviews.llvm.org/D122126

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits