Author: Sanjay Patel
Date: 2020-12-14T17:42:01-05:00
New Revision: d399f870b5a94b9dcc1817ed69fec88c325bb817

URL: https://github.com/llvm/llvm-project/commit/d399f870b5a94b9dcc1817ed69fec88c325bb817
DIFF: https://github.com/llvm/llvm-project/commit/d399f870b5a94b9dcc1817ed69fec88c325bb817.diff

LOG: [VectorCombine] make load transform poison-safe

As noted in D93229, the transform from a scalar load to a vector load
potentially leaks poison from the extra vector elements that are being
loaded.

We could use freeze here (and x86 codegen at least appears to be the same
either way), but we already have a shuffle in this logic to optionally
change the vector size, so let's allow that instruction to serve both
purposes.

Differential Revision: https://reviews.llvm.org/D93238
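To make the poison issue concrete, here is a minimal before/after sketch in
LLVM IR. It is illustrative only (the value names are made up); it mirrors the
load_f32_insert_v4f32 test updated below:

  ; Before the transform: scalar load + insert. Lanes 1-3 of %r are undef.
  %s = load float, float* %p, align 4
  %r = insertelement <4 x float> undef, float %s, i32 0

  ; After the transform: the wide load may bring poison into lanes 1-3, so
  ; instead of an identity shuffle, the mask keeps only lane 0 and leaves the
  ; remaining lanes undef.
  %b = bitcast float* %p to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %b, align 4
  %r = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>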
Added: 


Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/load.ll

Removed: 



################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19f5a2b432f7a..89b60045ce910 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -161,15 +161,17 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Value *CastedPtr = Builder.CreateBitCast(SrcPtr, MinVecTy->getPointerTo(AS));
   Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
 
-  // If the insert type does not match the target's minimum vector type,
-  // use an identity shuffle to shrink/grow the vector.
-  if (Ty != MinVecTy) {
-    unsigned OutputNumElts = Ty->getNumElements();
-    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
-    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
-      Mask[i] = i;
-    VecLd = Builder.CreateShuffleVector(VecLd, Mask);
-  }
+  // Set everything but element 0 to undef to prevent poison from propagating
+  // from the extra loaded memory. This will also optionally shrink/grow the
+  // vector from the loaded size to the output size.
+  // We assume this operation has no cost in codegen.
+  // Note that we could use freeze to avoid poison problems, but then we might
+  // still need a shuffle to change the vector size.
+  unsigned OutputNumElts = Ty->getNumElements();
+  SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+  Mask[0] = 0;
+  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;

diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index ba2bf3f37d7b6..03902c48157fe 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -175,7 +175,8 @@ define double @larger_fp_scalar_256bit_vec(<8 x float>* align 32 dereferenceable
 define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -185,7 +186,8 @@
 
 define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %b = bitcast <4 x float>* %p to float*
@@ -199,7 +201,8 @@
 define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -212,7 +215,8 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) {
 define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT:    [[R:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x i32> [[R]]
 ;
   %b = bitcast <16 x i8>* %p to i32*
@@ -225,7 +229,8 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceabl
 
 define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
@@ -238,7 +243,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenc
 
 define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0
@@ -253,7 +259,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
 ; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1
@@ -283,7 +290,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceabl
 ; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>*
-; CHECK-NEXT:    [[R:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i16> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0
@@ -410,7 +418,7 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -421,7 +429,7 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -434,7 +442,7 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
 ; CHECK-LABEL: @load_f32_insert_v16f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -446,7 +454,7 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p
 ; CHECK-LABEL: @load_f32_insert_v2f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 undef>
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -500,7 +508,8 @@ define void @PR47558_multiple_use_load(<2 x float>* nocapture nonnull %resultptr
 define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <2 x float>, <2 x float>* %p, align 4
@@ -512,7 +521,8 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 derefe
 define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT:    [[R:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %l = load <8 x float>, <8 x float>* %p, align 4


_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits