[llvm-branch-commits] [llvm] 50bff64 - [SLP][Test] Add test for PR46983
Author: Anton Afanasyev Date: 2020-12-07T21:07:40+03:00 New Revision: 50bff64158e9db3f91dd2d611a14707e8d173163 URL: https://github.com/llvm/llvm-project/commit/50bff64158e9db3f91dd2d611a14707e8d173163 DIFF: https://github.com/llvm/llvm-project/commit/50bff64158e9db3f91dd2d611a14707e8d173163.diff LOG: [SLP][Test] Add test for PR46983 Added: llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll Modified: Removed: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll new file mode 100644 index ..7df32e665805 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -0,0 +1,227 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX +; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 + +define void @store_i32(i32* nocapture %0, i32 %1, i32 %2) { +; CHECK-LABEL: @store_i32( +; CHECK-NEXT:[[TMP4:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* +; CHECK-NEXT:[[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, [[TBAA0:!tbaa !.*]] +; CHECK-NEXT:[[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1:%.*]], i32 0 +; CHECK-NEXT:[[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT:[[TMP8:%.*]] = mul <4 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT:[[TMP9:%.*]] = lshr <4 x i32> [[TMP8]], +; CHECK-NEXT:[[TMP10:%.*]] = icmp ult <4 x i32> [[TMP9]], +; CHECK-NEXT:[[TMP11:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT:[[TMP12:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT:store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] +; CHECK-NEXT:ret void +; + %4 = load i32, i32* %0, align 4, !tbaa !2 + %5 = mul i32 %4, %1 + %6 = lshr i32 %5, 15 + %7 = icmp ult i32 
%6, 255 + %8 = select i1 %7, i32 %6, i32 255 + store i32 %8, i32* %0, align 4, !tbaa !2 + %9 = getelementptr inbounds i32, i32* %0, i64 1 + %10 = load i32, i32* %9, align 4, !tbaa !2 + %11 = mul i32 %10, %1 + %12 = lshr i32 %11, 15 + %13 = icmp ult i32 %12, 255 + %14 = select i1 %13, i32 %12, i32 255 + store i32 %14, i32* %9, align 4, !tbaa !2 + %15 = getelementptr inbounds i32, i32* %0, i64 2 + %16 = load i32, i32* %15, align 4, !tbaa !2 + %17 = mul i32 %16, %1 + %18 = lshr i32 %17, 15 + %19 = icmp ult i32 %18, 255 + %20 = select i1 %19, i32 %18, i32 255 + store i32 %20, i32* %15, align 4, !tbaa !2 + %21 = getelementptr inbounds i32, i32* %0, i64 3 + %22 = load i32, i32* %21, align 4, !tbaa !2 + %23 = mul i32 %22, %1 + %24 = lshr i32 %23, 15 + %25 = icmp ult i32 %24, 255 + %26 = select i1 %25, i32 %24, i32 255 + store i32 %26, i32* %21, align 4, !tbaa !2 + ret void +} + +define void @store_i8(i8* nocapture %0, i32 %1, i32 %2) { +; CHECK-LABEL: @store_i8( +; CHECK-NEXT:[[TMP4:%.*]] = load i8, i8* [[TMP0:%.*]], align 1, [[TBAA4:!tbaa !.*]] +; CHECK-NEXT:[[TMP5:%.*]] = zext i8 [[TMP4]] to i32 +; CHECK-NEXT:[[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP1:%.*]] +; CHECK-NEXT:[[TMP7:%.*]] = lshr i32 [[TMP6]], 15 +; CHECK-NEXT:[[TMP8:%.*]] = icmp ult i32 [[TMP7]], 255 +; CHECK-NEXT:[[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP7]], i32 255 +; CHECK-NEXT:[[TMP10:%.*]] = trunc i32 [[TMP9]] to i8 +; CHECK-NEXT:store i8 [[TMP10]], i8* [[TMP0]], align 1, [[TBAA4]] +; CHECK-NEXT:[[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 1 +; CHECK-NEXT:[[TMP12:%.*]] = load i8, i8* [[TMP11]], align 1, [[TBAA4]] +; CHECK-NEXT:[[TMP13:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT:[[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP1]] +; CHECK-NEXT:[[TMP15:%.*]] = lshr i32 [[TMP14]], 15 +; CHECK-NEXT:[[TMP16:%.*]] = icmp ult i32 [[TMP15]], 255 +; CHECK-NEXT:[[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP15]], i32 255 +; CHECK-NEXT:[[TMP18:%.*]] = trunc i32 [[TMP17]] to i8 +; CHECK-NEXT:store i8 
[[TMP18]], i8* [[TMP11]], align 1, [[TBAA4]] +; CHECK-NEXT:[[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 2 +; CHECK-NEXT:[[TMP20:%.*]] = load i8, i8* [[TMP19]], align 1, [[TBAA4]] +; CHECK-NEXT:[[TMP21:%.*]] = zext i8 [[TMP20]] to i32 +; CHECK-NEXT:[[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP1]] +; CHECK-NEXT:[[TMP23:%.*]] = lshr i32 [[TMP22]], 15 +; CHECK-NEXT:[[TMP24:%.*]] = icmp ult i32 [[TMP23]], 255 +; CHECK-NEXT:[[TMP25:%.*]] = select i1 [[TMP24]], i32 [[TMP23]], i32 255 +; CHECK-NEXT:[[TMP26:%.*]] = trunc i32 [[TMP25]] to i8 +; CHECK-NEXT:store i8 [[TMP26]], i8* [[TMP19]], align 1, [[TBAA4]] +; CHECK-NEXT:[[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64
[llvm-branch-commits] [llvm] 6c3f56e - [SLP][Test] Differentiate SSE/AVX512 test coverage (NFC)
Author: Anton Afanasyev Date: 2020-12-08T12:00:52+03:00 New Revision: 6c3f56efa6e6ca746ba3dafae43251105f16e5fb URL: https://github.com/llvm/llvm-project/commit/6c3f56efa6e6ca746ba3dafae43251105f16e5fb DIFF: https://github.com/llvm/llvm-project/commit/6c3f56efa6e6ca746ba3dafae43251105f16e5fb.diff LOG: [SLP][Test] Differentiate SSE/AVX512 test coverage (NFC) Add test coverage for SSE/AVX512 for insert-after-bundle.ll test. Prepare this test for accurate showing of PR46983 fix. Added: Modified: llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll Removed: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll index 2a4d457f1063..fa1183400cb0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -slp-vectorizer < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -mattr=+sse < %s | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt -S -slp-vectorizer -mattr=+avx512f < %s | FileCheck %s --check-prefixes=CHECK,AVX512 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -410,75 +411,109 @@ for.end: ; preds = %for.body @ia = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16 define i32 @foo1() local_unnamed_addr #0 { -; CHECK-LABEL: @foo1( -; CHECK-NEXT: entry: -; CHECK-NEXT:[[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP1:%.*]] = xor <4 x i32> [[TMP0]], -; CHECK-NEXT:store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP3:%.*]] = xor <4 x i32> 
[[TMP2]], -; CHECK-NEXT:store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP5:%.*]] = xor <4 x i32> [[TMP4]], -; CHECK-NEXT:store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP7:%.*]] = xor <4 x i32> [[TMP6]], -; CHECK-NEXT:store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP9:%.*]] = xor <4 x i32> [[TMP8]], -; CHECK-NEXT:store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP11:%.*]] = xor <4 x i32> [[TMP10]], -; CHECK-NEXT:store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP13:%.*]] = xor <4 x i32> [[TMP12]], -; CHECK-NEXT:store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* 
@ia, i64 0, i64 24) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP15:%.*]] = xor <4 x i32> [[TMP14]], -; CHECK-NEXT:store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16 -; CHECK-NEXT:[[TMP17:%.*]] = xor <4 x i32> [[TMP16]], -; CHECK-NEXT:store <4 x i32> [[TMP17]], <4 x i32
[llvm-branch-commits] [llvm] e5bf2e8 - [SLP] Use the width of value truncated just before storing
Author: Anton Afanasyev Date: 2020-12-09T16:38:45+03:00 New Revision: e5bf2e8989469ec328d910be26bd3ee0710326d9 URL: https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9 DIFF: https://github.com/llvm/llvm-project/commit/e5bf2e8989469ec328d910be26bd3ee0710326d9.diff LOG: [SLP] Use the width of value truncated just before storing For stores chain vectorization we choose the size of vector elements to ensure we fit to minimum and maximum vector register size for the number of elements given. This patch corrects vector element size choosing the width of value truncated just before storing instead of the width of value stored. Fixes PR46983 Differential Revision: https://reviews.llvm.org/D92824 Added: Modified: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll Removed: diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e3f6d8cc05f7..456485f45809 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5536,10 +5536,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { } unsigned BoUpSLP::getVectorElementSize(Value *V) { - // If V is a store, just return the width of the stored value without - // traversing the expression tree. This is the common case. - if (auto *Store = dyn_cast<StoreInst>(V)) -return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + // If V is a store, just return the width of the stored value (or value + // truncated just before storing) without traversing the expression tree. + // This is the common case. 
+ if (auto *Store = dyn_cast<StoreInst>(V)) { +if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) + return DL->getTypeSizeInBits(Trunc->getSrcTy()); +else + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + } auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll index fa1183400cb0..23aa3536a88d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -22,132 +22,304 @@ entry: } define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i8* noalias nocapture readonly %d, i8* noalias nocapture %e, i32 %w) local_unnamed_addr #1 { -; CHECK-LABEL: @bar( -; CHECK-NEXT: entry: -; CHECK-NEXT:[[TMP0:%.*]] = insertelement <16 x i32> undef, i32 [[W:%.*]], i32 0 -; CHECK-NEXT:[[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[W]], i32 1 -; CHECK-NEXT:[[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[W]], i32 2 -; CHECK-NEXT:[[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[W]], i32 3 -; CHECK-NEXT:[[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[W]], i32 4 -; CHECK-NEXT:[[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[W]], i32 5 -; CHECK-NEXT:[[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[W]], i32 6 -; CHECK-NEXT:[[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[W]], i32 7 -; CHECK-NEXT:[[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[W]], i32 8 -; CHECK-NEXT:[[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[W]], i32 9 -; CHECK-NEXT:[[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[W]], i32 10 -; CHECK-NEXT:[[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[W]], i32 11 -; CHECK-NEXT:[[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[W]], i32 12 -; CHECK-NEXT:[[TMP13:%.*]] = insertelement 
<16 x i32> [[TMP12]], i32 [[W]], i32 13 -; CHECK-NEXT:[[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[W]], i32 14 -; CHECK-NEXT:[[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[W]], i32 15 -; CHECK-NEXT:br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT:[[I_0356:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[A_ADDR_0355:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[E_ADDR_0354:%.*]] = phi i8* [ [[E:%.*]], [[ENTRY]] ], [ [[ADD_PTR192:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[D_ADDR_0353:%.*]] = phi i8* [ [[D:%.*]], [[ENTRY]] ], [ [[ADD_PTR191:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[C_ADDR_0352:%.*]] = phi i8* [ [[C:%.*]], [[ENTRY]] ], [ [[ADD_PTR190:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[B_ADDR_0351:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[ADD_PTR189:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT:[[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 1 -; CH
[llvm-branch-commits] [llvm] b8c847e - [SLP][Test] Precommit test for D93192
Author: Anton Afanasyev Date: 2020-12-14T09:23:47+03:00 New Revision: b8c847ee731b319c1790ab4410f14933aa59efd5 URL: https://github.com/llvm/llvm-project/commit/b8c847ee731b319c1790ab4410f14933aa59efd5 DIFF: https://github.com/llvm/llvm-project/commit/b8c847ee731b319c1790ab4410f14933aa59efd5.diff LOG: [SLP][Test] Precommit test for D93192 This test shows failure of combined stores chains vectorization Added: llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll Modified: Removed: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll new file mode 100644 index ..63e3178c0278 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-- -mcpu=corei7 | FileCheck %s + +define void @foo(i8* %v0, i8* readonly %v1) { +; CHECK-LABEL: @foo( +; CHECK-NEXT:[[T0:%.*]] = bitcast i8* [[V0:%.*]] to i32* +; CHECK-NEXT:[[T1:%.*]] = bitcast i8* [[V1:%.*]] to i32* +; CHECK-NEXT:[[T02:%.*]] = bitcast i8* [[V0]] to i64* +; CHECK-NEXT:[[T12:%.*]] = bitcast i8* [[V1]] to i64* +; CHECK-NEXT:[[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; CHECK-NEXT:[[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 5 +; CHECK-NEXT:[[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 +; CHECK-NEXT:[[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 7 +; CHECK-NEXT:[[T142:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 8 +; CHECK-NEXT:[[T182:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 9 +; CHECK-NEXT:[[T222:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 10 +; CHECK-NEXT:[[T262:%.*]] = getelementptr inbounds i64, i64* [[T12]], i64 11 +; CHECK-NEXT:[[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; CHECK-NEXT:[[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 
+; CHECK-NEXT:[[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 +; CHECK-NEXT:[[T32:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 +; CHECK-NEXT:[[T212:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 8 +; CHECK-NEXT:[[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9 +; CHECK-NEXT:[[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10 +; CHECK-NEXT:[[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11 +; CHECK-NEXT:[[T19:%.*]] = load i32, i32* [[T14]], align 4 +; CHECK-NEXT:[[T23:%.*]] = load i32, i32* [[T18]], align 4 +; CHECK-NEXT:[[T27:%.*]] = load i32, i32* [[T22]], align 4 +; CHECK-NEXT:[[T30:%.*]] = load i32, i32* [[T26]], align 4 +; CHECK-NEXT:[[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>* +; CHECK-NEXT:[[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>* +; CHECK-NEXT:[[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT:[[T20:%.*]] = add nsw i32 [[T19]], 4 +; CHECK-NEXT:[[T24:%.*]] = add nsw i32 [[T23]], 4 +; CHECK-NEXT:[[T28:%.*]] = add nsw i32 [[T27]], 6 +; CHECK-NEXT:[[T31:%.*]] = add nsw i32 [[T30]], 7 +; CHECK-NEXT:[[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], +; CHECK-NEXT:[[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], +; CHECK-NEXT:[[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>* +; CHECK-NEXT:store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8 +; CHECK-NEXT:[[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>* +; CHECK-NEXT:store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8 +; CHECK-NEXT:store i32 [[T20]], i32* [[T21]], align 4 +; CHECK-NEXT:store i32 [[T24]], i32* [[T25]], align 4 +; CHECK-NEXT:store i32 [[T28]], i32* [[T29]], align 4 +; CHECK-NEXT:store i32 [[T31]], i32* [[T32]], align 4 +; CHECK-NEXT:ret void +; + %t0 = bitcast i8* %v0 to i32* + %t1 = bitcast i8* %v1 to i32* + + %t02 = bitcast i8* %v0 to i64* + %t12 = bitcast i8* %v1 to i64* + + %t14 = getelementptr inbounds i32, i32* %t1, i64 4 + 
%t18 = getelementptr inbounds i32, i32* %t1, i64 5 + %t22 = getelementptr inbounds i32, i32* %t1, i64 6 + %t26 = getelementptr inbounds i32, i32* %t1, i64 7 + + %t142 = getelementptr inbounds i64, i64* %t12, i64 8 + %t182 = getelementptr inbounds i64, i64* %t12, i64 9 + %t222 = getelementptr inbounds i64, i64* %t12, i64 10 + %t262 = getelementptr inbounds i64, i64* %t12, i64 11 + + %t21 = getelementptr inbounds i32, i32* %t0, i64 4 + %t25 = getelementptr inbounds i32, i32* %t0, i64 5 + %t29 = getelementptr inbounds i32, i32* %t0, i64 6 + %t32 = getelementptr inbounds i32, i32* %t0, i64 7 + + %t212 = getelementptr inbounds i64, i64* %t02, i64 8 + %t252 = getelementptr inbou
[llvm-branch-commits] [llvm] fac7c7e - [SLP] Fix vector element size for the store chains
Author: Anton Afanasyev Date: 2020-12-14T15:51:43+03:00 New Revision: fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047 URL: https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047 DIFF: https://github.com/llvm/llvm-project/commit/fac7c7ec3ccd64d19b6d33af0a8bc2f3f7f7b047.diff LOG: [SLP] Fix vector element size for the store chains Vector element size could be different for different store chains. This patch prevents wrong computation of maximum number of elements for that case. Differential Revision: https://reviews.llvm.org/D93192 Added: Modified: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll Removed: diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c5ba3709f6b1..e1c1c6edf08c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6076,7 +6076,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, // If a vector register can't hold 1 element, we are done. 
unsigned MaxVecRegSize = R.getMaxVecRegSize(); -unsigned EltSize = R.getVectorElementSize(Stores[0]); +unsigned EltSize = R.getVectorElementSize(Operands[0]); if (MaxVecRegSize % EltSize != 0) continue; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll index 63e3178c0278..2fdef624d48f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll @@ -23,28 +23,21 @@ define void @foo(i8* %v0, i8* readonly %v1) { ; CHECK-NEXT:[[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9 ; CHECK-NEXT:[[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10 ; CHECK-NEXT:[[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11 -; CHECK-NEXT:[[T19:%.*]] = load i32, i32* [[T14]], align 4 -; CHECK-NEXT:[[T23:%.*]] = load i32, i32* [[T18]], align 4 -; CHECK-NEXT:[[T27:%.*]] = load i32, i32* [[T22]], align 4 -; CHECK-NEXT:[[T30:%.*]] = load i32, i32* [[T26]], align 4 -; CHECK-NEXT:[[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>* -; CHECK-NEXT:[[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T222]] to <2 x i64>* +; CHECK-NEXT:[[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>* +; CHECK-NEXT:[[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT:[[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>* ; CHECK-NEXT:[[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 -; CHECK-NEXT:[[T20:%.*]] = add nsw i32 [[T19]], 4 -; CHECK-NEXT:[[T24:%.*]] = add nsw i32 [[T23]], 4 -; CHECK-NEXT:[[T28:%.*]] = add nsw i32 [[T27]], 6 -; CHECK-NEXT:[[T31:%.*]] = add nsw i32 [[T30]], 7 -; CHECK-NEXT:[[TMP5:%.*]] = add nsw <2 x i64> [[TMP2]], -; CHECK-NEXT:[[TMP6:%.*]] = add nsw <2 x i64> [[TMP4]], -; CHECK-NEXT:[[TMP7:%.*]] = bitcast i64* [[T212]] to <2 x i64>* -; CHECK-NEXT:store <2 x i64> [[TMP5]], <2 x i64>* [[TMP7]], align 8 
-; CHECK-NEXT:[[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>* -; CHECK-NEXT:store <2 x i64> [[TMP6]], <2 x i64>* [[TMP8]], align 8 -; CHECK-NEXT:store i32 [[T20]], i32* [[T21]], align 4 -; CHECK-NEXT:store i32 [[T24]], i32* [[T25]], align 4 -; CHECK-NEXT:store i32 [[T28]], i32* [[T29]], align 4 -; CHECK-NEXT:store i32 [[T31]], i32* [[T32]], align 4 +; CHECK-NEXT:[[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>* +; CHECK-NEXT:[[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT:[[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], +; CHECK-NEXT:[[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], +; CHECK-NEXT:[[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], +; CHECK-NEXT:[[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>* +; CHECK-NEXT:store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8 +; CHECK-NEXT:[[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>* +; CHECK-NEXT:store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8 +; CHECK-NEXT:[[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>* +; CHECK-NEXT:store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT:ret void ; %t0 = bitcast i8* %v0 to i32* ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits