[llvm-branch-commits] [llvm] 50bff64 - [SLP][Test] Add test for PR46983

2020-12-07 Thread Anton Afanasyev via llvm-branch-commits

Author: Anton Afanasyev
Date: 2020-12-07T21:07:40+03:00
New Revision: 50bff64158e9db3f91dd2d611a14707e8d173163

URL: 
https://github.com/llvm/llvm-project/commit/50bff64158e9db3f91dd2d611a14707e8d173163
DIFF: 
https://github.com/llvm/llvm-project/commit/50bff64158e9db3f91dd2d611a14707e8d173163.diff

LOG: [SLP][Test] Add test for PR46983

Added: 
llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Modified: 


Removed: 




diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
new file mode 100644
index 000000000000..7df32e665805
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -instcombine -S 
-mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s 
--check-prefixes=CHECK,AVX
+; RUN: opt < %s -slp-vectorizer -instcombine -S 
-mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s 
--check-prefixes=CHECK,AVX2
+
+define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) {
+; CHECK-LABEL: @store_i32(
+; CHECK-NEXT:[[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
+; CHECK-NEXT:[[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, 
[[TBAA0:!tbaa !.*]]
+; CHECK-NEXT:[[TMP6:%.*]] = insertelement <4 x i32> undef, i32 
[[TMP1:%.*]], i32 0
+; CHECK-NEXT:[[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> 
undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:[[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]]
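+; CHECK-NEXT:[[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], <i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT:[[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], <i32 255, i32 255, i32 255, i32 255>
+; CHECK-NEXT:[[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> 
[[TMP9]], <4 x i32> <i32 255, i32 255, i32 255, i32 255>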
+; CHECK-NEXT:[[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT:store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, 
[[TBAA0]]
+; CHECK-NEXT:ret void
+;
+  %4 = load i32, i32* %0, align 4, !tbaa !2
+  %5 = mul i32 %4, %1
+  %6 = lshr i32 %5, 15
+  %7 = icmp ult i32 %6, 255
+  %8 = select i1 %7, i32 %6, i32 255
+  store i32 %8, i32* %0, align 4, !tbaa !2
+  %9 = getelementptr inbounds i32, i32* %0, i64 1
+  %10 = load i32, i32* %9, align 4, !tbaa !2
+  %11 = mul i32 %10, %1
+  %12 = lshr i32 %11, 15
+  %13 = icmp ult i32 %12, 255
+  %14 = select i1 %13, i32 %12, i32 255
+  store i32 %14, i32* %9, align 4, !tbaa !2
+  %15 = getelementptr inbounds i32, i32* %0, i64 2
+  %16 = load i32, i32* %15, align 4, !tbaa !2
+  %17 = mul i32 %16, %1
+  %18 = lshr i32 %17, 15
+  %19 = icmp ult i32 %18, 255
+  %20 = select i1 %19, i32 %18, i32 255
+  store i32 %20, i32* %15, align 4, !tbaa !2
+  %21 = getelementptr inbounds i32, i32* %0, i64 3
+  %22 = load i32, i32* %21, align 4, !tbaa !2
+  %23 = mul i32 %22, %1
+  %24 = lshr i32 %23, 15
+  %25 = icmp ult i32 %24, 255
+  %26 = select i1 %25, i32 %24, i32 255
+  store i32 %26, i32* %21, align 4, !tbaa !2
+  ret void
+}
+
+define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) {
+; CHECK-LABEL: @store_i8(
+; CHECK-NEXT:[[TMP4:%.*]] = load i8, i8* [[TMP0:%.*]], align 1, 
[[TBAA4:!tbaa !.*]]
+; CHECK-NEXT:[[TMP5:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:[[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP1:%.*]]
+; CHECK-NEXT:[[TMP7:%.*]] = lshr i32 [[TMP6]], 15
+; CHECK-NEXT:[[TMP8:%.*]] = icmp ult i32 [[TMP7]], 255
+; CHECK-NEXT:[[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP7]], i32 255
+; CHECK-NEXT:[[TMP10:%.*]] = trunc i32 [[TMP9]] to i8
+; CHECK-NEXT:store i8 [[TMP10]], i8* [[TMP0]], align 1, [[TBAA4]]
+; CHECK-NEXT:[[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 1
+; CHECK-NEXT:[[TMP12:%.*]] = load i8, i8* [[TMP11]], align 1, [[TBAA4]]
+; CHECK-NEXT:[[TMP13:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT:[[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP1]]
+; CHECK-NEXT:[[TMP15:%.*]] = lshr i32 [[TMP14]], 15
+; CHECK-NEXT:[[TMP16:%.*]] = icmp ult i32 [[TMP15]], 255
+; CHECK-NEXT:[[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP15]], i32 255
+; CHECK-NEXT:[[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
+; CHECK-NEXT:store i8 [[TMP18]], i8* [[TMP11]], align 1, [[TBAA4]]
+; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 2
+; CHECK-NEXT:[[TMP20:%.*]] = load i8, i8* [[TMP19]], align 1, [[TBAA4]]
+; CHECK-NEXT:[[TMP21:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT:[[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP1]]
+; CHECK-NEXT:[[TMP23:%.*]] = lshr i32 [[TMP22]], 15
+; CHECK-NEXT:[[TMP24:%.*]] = icmp ult i32 [[TMP23]], 255
+; CHECK-NEXT:[[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 255
+; CHECK-NEXT:[[TMP26:%.*]] = trunc i32 [[TMP25]] to i8
+; CHECK-NEXT:store i8 [[TMP26]], i8* [[TMP19]], align 1, [[TBAA4]]
+; CHECK-NEXT:[[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 

[llvm-branch-commits] [llvm] 6c3f56e - [SLP][Test] Differentiate SSE/AVX512 test coverage (NFC)

2020-12-08 Thread Anton Afanasyev via llvm-branch-commits

Author: Anton Afanasyev
Date: 2020-12-08T12:00:52+03:00
New Revision: 6c3f56efa6e6ca746ba3dafae43251105f16e5fb

URL: 
https://github.com/llvm/llvm-project/commit/6c3f56efa6e6ca746ba3dafae43251105f16e5fb
DIFF: 
https://github.com/llvm/llvm-project/commit/6c3f56efa6e6ca746ba3dafae43251105f16e5fb.diff

LOG: [SLP][Test] Differentiate SSE/AVX512 test coverage (NFC)

Add test coverage for SSE/AVX512 for insert-after-bundle.ll test.
Prepare this test for accurate showing of PR46983 fix.

Added: 


Modified: 
llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll

Removed: 




diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index 2a4d457f1063..fa1183400cb0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -slp-vectorizer < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mattr=+sse  < %s | FileCheck %s 
--check-prefixes=CHECK,SSE
+; RUN: opt -S -slp-vectorizer -mattr=+avx512f < %s | FileCheck %s 
--check-prefixes=CHECK,AVX512
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -410,75 +411,109 @@ for.end:  ; 
preds = %for.body
 @ia = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
 
 define i32 @foo1() local_unnamed_addr #0 {
-; CHECK-LABEL: @foo1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:[[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x 
i32]* @ib to <4 x i32>*), align 16
-; CHECK-NEXT:[[TMP1:%.*]] = xor <4 x i32> [[TMP0]], 
-; CHECK-NEXT:store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia 
to <4 x i32>*), align 16
-; CHECK-NEXT:[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP3:%.*]] = xor <4 x i32> [[TMP2]], 
-; CHECK-NEXT:store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP5:%.*]] = xor <4 x i32> [[TMP4]], 
-; CHECK-NEXT:store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP7:%.*]] = xor <4 x i32> [[TMP6]], 
-; CHECK-NEXT:store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP9:%.*]] = xor <4 x i32> [[TMP8]], 
-; CHECK-NEXT:store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP11:%.*]] = xor <4 x i32> [[TMP10]], 
-; CHECK-NEXT:store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP13:%.*]] = xor <4 x i32> [[TMP12]], 
-; CHECK-NEXT:store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP15:%.*]] = xor <4 x i32> [[TMP14]], 
-; CHECK-NEXT:store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* 
getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x 
i32>*), align 16
-; CHECK-NEXT:[[TMP17:%.*]] = xor <4 x i32> [[TMP16]], 
-; CHECK-NEXT:store <4 x i32> [[TMP17]], <4 x i32

[llvm-branch-commits] [llvm] e5bf2e8 - [SLP] Use the width of value truncated just before storing

2020-12-09 Thread Anton Afanasyev via llvm-branch-commits

Author: Anton Afanasyev
Date: 2020-12-09T16:38:45+03:00
New Revision: e5bf2e8989469ec328d910be26bd3ee0710326d9

URL: 
https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9
DIFF: 
https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9.diff

LOG: [SLP] Use the width of value truncated just before storing

For stores chain vectorization we choose the size of vector
elements to ensure we fit to minimum and maximum vector register
size for the number of elements given. This patch corrects vector
element size choosing the width of value truncated just before
storing instead of the width of value stored.

Fixes PR46983

Differential Revision: https://reviews.llvm.org/D92824

Added: 


Modified: 
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll

Removed: 




diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 
b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e3f6d8cc05f7..456485f45809 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5536,10 +5536,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
 }
 
 unsigned BoUpSLP::getVectorElementSize(Value *V) {
-  // If V is a store, just return the width of the stored value without
-  // traversing the expression tree. This is the common case.
-  if (auto *Store = dyn_cast<StoreInst>(V))
-return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+  // If V is a store, just return the width of the stored value (or value
+  // truncated just before storing) without traversing the expression tree.
+  // This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V)) {
+if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+  return DL->getTypeSizeInBits(Trunc->getSrcTy());
+else
+  return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+  }
 
   auto E = InstrElementSize.find(V);
   if (E != InstrElementSize.end())

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
index fa1183400cb0..23aa3536a88d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@@ -22,132 +22,304 @@ entry:
 }
 
 define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture 
readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly 
%d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:[[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], 
i32 0
-; CHECK-NEXT:[[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], 
i32 1
-; CHECK-NEXT:[[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], 
i32 2
-; CHECK-NEXT:[[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], 
i32 3
-; CHECK-NEXT:[[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], 
i32 4
-; CHECK-NEXT:[[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], 
i32 5
-; CHECK-NEXT:[[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], 
i32 6
-; CHECK-NEXT:[[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], 
i32 7
-; CHECK-NEXT:[[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], 
i32 8
-; CHECK-NEXT:[[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], 
i32 9
-; CHECK-NEXT:[[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], 
i32 10
-; CHECK-NEXT:[[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 
[[W]], i32 11
-; CHECK-NEXT:[[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 
[[W]], i32 12
-; CHECK-NEXT:[[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 
[[W]], i32 13
-; CHECK-NEXT:[[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 
[[W]], i32 14
-; CHECK-NEXT:[[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 
[[W]], i32 15
-; CHECK-NEXT:br label [[FOR_BODY:%.*]]
-; CHECK:   for.body:
-; CHECK-NEXT:[[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], 
[[FOR_BODY]] ]
-; CHECK-NEXT:[[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ 
[[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:[[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ 
[[ADD_PTR192:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:[[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ 
[[ADD_PTR191:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:[[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ 
[[ADD_PTR190:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:[[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ 
[[ADD_PTR189:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:[[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* 
[[C_ADDR_0352]], i64 1
-; CH

[llvm-branch-commits] [llvm] b8c847e - [SLP][Test] Precommit test for D93192

2020-12-13 Thread Anton Afanasyev via llvm-branch-commits

Author: Anton Afanasyev
Date: 2020-12-14T09:23:47+03:00
New Revision: b8c847ee731b319c1790ab4410f14933aa59efd5

URL: 
https://github.com/llvm/llvm-project/commit/b8c847ee731b319c1790ab4410f14933aa59efd5
DIFF: 
https://github.com/llvm/llvm-project/commit/b8c847ee731b319c1790ab4410f14933aa59efd5.diff

LOG: [SLP][Test] Precommit test for D93192

This test shows failure of combined stores chains vectorization

Added: 
llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll

Modified: 


Removed: 




diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
new file mode 100644
index 000000000000..63e3178c0278
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 | FileCheck 
%s
+
+define void @foo(i8* %v0, i8* readonly %v1) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:[[T0:%.*]] = bitcast i8* [[V0:%.*]] to i32*
+; CHECK-NEXT:[[T1:%.*]] = bitcast i8* [[V1:%.*]] to i32*
+; CHECK-NEXT:[[T02:%.*]] = bitcast i8* [[V0]] to i64*
+; CHECK-NEXT:[[T12:%.*]] = bitcast i8* [[V1]] to i64*
+; CHECK-NEXT:[[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
+; CHECK-NEXT:[[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 5
+; CHECK-NEXT:[[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
+; CHECK-NEXT:[[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 7
+; CHECK-NEXT:[[T142:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 8
+; CHECK-NEXT:[[T182:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 9
+; CHECK-NEXT:[[T222:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 
10
+; CHECK-NEXT:[[T262:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 
11
+; CHECK-NEXT:[[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
+; CHECK-NEXT:[[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
+; CHECK-NEXT:[[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
+; CHECK-NEXT:[[T32:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
+; CHECK-NEXT:[[T212:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 8
+; CHECK-NEXT:[[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9
+; CHECK-NEXT:[[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 
10
+; CHECK-NEXT:[[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 
11
+; CHECK-NEXT:[[T19:%.*]] = load i32, i32* [[T14]], align 4
+; CHECK-NEXT:[[T23:%.*]] = load i32, i32* [[T18]], align 4
+; CHECK-NEXT:[[T27:%.*]] = load i32, i32* [[T22]], align 4
+; CHECK-NEXT:[[T30:%.*]] = load i32, i32* [[T26]], align 4
+; CHECK-NEXT:[[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
+; CHECK-NEXT:[[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT:[[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:[[T20:%.*]] = add nsw i32 [[T19]], 4
+; CHECK-NEXT:[[T24:%.*]] = add nsw i32 [[T23]], 4
+; CHECK-NEXT:[[T28:%.*]] = add nsw i32 [[T27]], 6
+; CHECK-NEXT:[[T31:%.*]] = add nsw i32 [[T30]], 7
+; CHECK-NEXT:[[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], 
+; CHECK-NEXT:[[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], 
+; CHECK-NEXT:[[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT:store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8
+; CHECK-NEXT:[[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
+; CHECK-NEXT:store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8
+; CHECK-NEXT:store i32 [[T20]], i32* [[T21]], align 4
+; CHECK-NEXT:store i32 [[T24]], i32* [[T25]], align 4
+; CHECK-NEXT:store i32 [[T28]], i32* [[T29]], align 4
+; CHECK-NEXT:store i32 [[T31]], i32* [[T32]], align 4
+; CHECK-NEXT:ret void
+;
+  %t0 = bitcast i8* %v0 to i32*
+  %t1 = bitcast i8* %v1 to i32*
+
+  %t02 = bitcast i8* %v0 to i64*
+  %t12 = bitcast i8* %v1 to i64*
+
+  %t14 = getelementptr inbounds i32, i32* %t1, i64 4
+  %t18 = getelementptr inbounds i32, i32* %t1, i64 5
+  %t22 = getelementptr inbounds i32, i32* %t1, i64 6
+  %t26 = getelementptr inbounds i32, i32* %t1, i64 7
+
+  %t142 = getelementptr inbounds i64, i64* %t12, i64 8
+  %t182 = getelementptr inbounds i64, i64* %t12, i64 9
+  %t222 = getelementptr inbounds i64, i64* %t12, i64 10
+  %t262 = getelementptr inbounds i64, i64* %t12, i64 11
+
+  %t21 = getelementptr inbounds i32, i32* %t0, i64 4
+  %t25 = getelementptr inbounds i32, i32* %t0, i64 5
+  %t29 = getelementptr inbounds i32, i32* %t0, i64 6
+  %t32 = getelementptr inbounds i32, i32* %t0, i64 7
+
+  %t212 = getelementptr inbounds i64, i64* %t02, i64 8
+  %t252 = getelementptr inbou

[llvm-branch-commits] [llvm] fac7c7e - [SLP] Fix vector element size for the store chains

2020-12-14 Thread Anton Afanasyev via llvm-branch-commits

Author: Anton Afanasyev
Date: 2020-12-14T15:51:43+03:00
New Revision: fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047

URL: 
https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047
DIFF: 
https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047.diff

LOG: [SLP] Fix vector element size for the store chains

Vector element size could be different for different store chains.
This patch prevents wrong computation of maximum number of elements
for that case.

Differential Revision: https://reviews.llvm.org/D93192

Added: 


Modified: 
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll

Removed: 




diff  --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp 
b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c5ba3709f6b1..e1c1c6edf08c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6076,7 +6076,7 @@ bool 
SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
 
 // If a vector register can't hold 1 element, we are done.
 unsigned MaxVecRegSize = R.getMaxVecRegSize();
-unsigned EltSize = R.getVectorElementSize(Stores[0]);
+unsigned EltSize = R.getVectorElementSize(Operands[0]);
 if (MaxVecRegSize % EltSize != 0)
   continue;
 

diff  --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
index 63e3178c0278..2fdef624d48f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll
@@ -23,28 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) {
 ; CHECK-NEXT:[[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9
 ; CHECK-NEXT:[[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 
10
 ; CHECK-NEXT:[[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 
11
-; CHECK-NEXT:[[T19:%.*]] = load i32, i32* [[T14]], align 4
-; CHECK-NEXT:[[T23:%.*]] = load i32, i32* [[T18]], align 4
-; CHECK-NEXT:[[T27:%.*]] = load i32, i32* [[T22]], align 4
-; CHECK-NEXT:[[T30:%.*]] = load i32, i32* [[T26]], align 4
-; CHECK-NEXT:[[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
-; CHECK-NEXT:[[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT:[[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>*
+; CHECK-NEXT:[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>*
 ; CHECK-NEXT:[[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT:[[T20:%.*]] = add nsw i32 [[T19]], 4
-; CHECK-NEXT:[[T24:%.*]] = add nsw i32 [[T23]], 4
-; CHECK-NEXT:[[T28:%.*]] = add nsw i32 [[T27]], 6
-; CHECK-NEXT:[[T31:%.*]] = add nsw i32 [[T30]], 7
-; CHECK-NEXT:[[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], 
-; CHECK-NEXT:[[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], 
-; CHECK-NEXT:[[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
-; CHECK-NEXT:store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT:[[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
-; CHECK-NEXT:store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8
-; CHECK-NEXT:store i32 [[T20]], i32* [[T21]], align 4
-; CHECK-NEXT:store i32 [[T24]], i32* [[T25]], align 4
-; CHECK-NEXT:store i32 [[T28]], i32* [[T29]], align 4
-; CHECK-NEXT:store i32 [[T31]], i32* [[T32]], align 4
+; CHECK-NEXT:[[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>*
+; CHECK-NEXT:[[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:[[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 4, i32 4, i32 6, i32 7>
+; CHECK-NEXT:[[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], 
+; CHECK-NEXT:[[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], 
+; CHECK-NEXT:[[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>*
+; CHECK-NEXT:store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
+; CHECK-NEXT:[[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>*
+; CHECK-NEXT:store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
+; CHECK-NEXT:[[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>*
+; CHECK-NEXT:store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT:ret void
 ;
   %t0 = bitcast i8* %v0 to i32*



___
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits