https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/104661
>From 5729891997dacfac91ad807ddfde60aec44708fb Mon Sep 17 00:00:00 2001 From: Shilei Tian <i...@tianshilei.me> Date: Fri, 17 Jan 2025 11:01:49 -0500 Subject: [PATCH] [Clang] Remove 3-element vector load and store special handling Clang has long-standing special handling whereby 3-element vector loads and stores are performed as 4-element ones, after which a shufflevector is used to extract the used elements. Odd-sized vector codegen should now work reasonably well. This patch removes this special handling, as well as the compiler argument `-fpreserve-vec3-type`. --- clang/include/clang/Basic/CodeGenOptions.def | 3 -- clang/include/clang/Basic/LangOptions.def | 2 + clang/include/clang/Driver/Options.td | 4 -- clang/lib/Basic/LangOptions.cpp | 2 + clang/lib/CodeGen/ABIInfo.cpp | 8 +++ clang/lib/CodeGen/ABIInfo.h | 8 +++ clang/lib/CodeGen/CGExpr.cpp | 49 +++++++++--------- clang/lib/CodeGen/Targets/AMDGPU.cpp | 11 ++++ .../test/CodeGenCXX/matrix-vector-bit-int.cpp | 32 ++++++------ clang/test/CodeGenOpenCL/amdgpu-alignment.cl | 8 +-- clang/test/CodeGenOpenCL/preserve_vec3.cl | 51 ++++++++++--------- 11 files changed, 101 insertions(+), 77 deletions(-) diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 0f4ed13d5f3d8c..1ab8c7fb4d3c33 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -413,9 +413,6 @@ CODEGENOPT(StrictReturn, 1, 1) /// Whether emit pseudo probes for sample pgo profile collection. CODEGENOPT(PseudoProbeForProfiling, 1, 0) -/// Whether 3-component vector type is preserved. 
-CODEGENOPT(PreserveVec3Type, 1, 0) - CODEGENOPT(NoPLT, 1, 0) /// Whether to emit all vtables diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 3b833240e5b68c..a980be853d53e6 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -532,6 +532,8 @@ BENIGN_LANGOPT(CheckConstexprFunctionBodies, 1, 1, LANGOPT(BoundsSafety, 1, 0, "Bounds safety extension for C") +LANGOPT(PreserveVec3Type, 1, 0, "Preserve 3-component vector type") + #undef LANGOPT #undef COMPATIBLE_LANGOPT #undef BENIGN_LANGOPT diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d38dd2b4e3cf09..3e752ae86f06d9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8240,10 +8240,6 @@ def fhlsl_strict_availability : Flag<["-"], "fhlsl-strict-availability">, Group<hlsl_Group>, MarshallingInfoFlag<LangOpts<"HLSLStrictAvailability">>; -def fpreserve_vec3_type : Flag<["-"], "fpreserve-vec3-type">, - HelpText<"Preserve 3-component vector type">, - MarshallingInfoFlag<CodeGenOpts<"PreserveVec3Type">>, - ImpliedByAnyOf<[hlsl.KeyPath]>; def fwchar_type_EQ : Joined<["-"], "fwchar-type=">, HelpText<"Select underlying type for wchar_t">, Values<"char,short,int">, diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp index 94caf6a3897bc1..e3037ec819add2 100644 --- a/clang/lib/Basic/LangOptions.cpp +++ b/clang/lib/Basic/LangOptions.cpp @@ -208,6 +208,8 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang, // OpenCL and HLSL have half keyword Opts.Half = Opts.OpenCL || Opts.HLSL; + + Opts.PreserveVec3Type = Opts.HLSL; } FPOptions FPOptions::defaultWithoutTrailingStorage(const LangOptions &LO) { diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp index 642bca9e8b76da..cda8a494f6c27d 100644 --- a/clang/lib/CodeGen/ABIInfo.cpp +++ b/clang/lib/CodeGen/ABIInfo.cpp @@ 
-236,6 +236,14 @@ void ABIInfo::appendAttributeMangling(StringRef AttrStr, } } +llvm::FixedVectorType * +ABIInfo::getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const { + if (T->getNumElements() == 3 && !Opt.PreserveVec3Type) + return llvm::FixedVectorType::get(T->getElementType(), 4); + return T; +} + // Pin the vtable to this file. SwiftABIInfo::~SwiftABIInfo() = default; diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h index b8a8de57e5b971..213e7879c3162b 100644 --- a/clang/lib/CodeGen/ABIInfo.h +++ b/clang/lib/CodeGen/ABIInfo.h @@ -20,6 +20,7 @@ class Value; class LLVMContext; class DataLayout; class Type; +class FixedVectorType; } // namespace llvm namespace clang { @@ -123,6 +124,13 @@ class ABIInfo { raw_ostream &Out) const; virtual void appendAttributeMangling(StringRef AttrStr, raw_ostream &Out) const; + + /// Returns the optimal vector memory type based on the given vector type. For + /// example, on certain targets, a vector with 3 elements might be promoted to + /// one with 4 elements to improve performance. + virtual llvm::FixedVectorType * + getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const; }; /// Target specific hooks for defining how a type should be passed or returned diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 9a9a8c7f6eae09..78bac4c688dd3c 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -2002,20 +2002,19 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile, return EmitFromMemory(V, Ty); } - // Handle vectors of size 3 like size 4 for better performance. 
- const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast<llvm::FixedVectorType>(EltTy); - - if (!CGM.getCodeGenOpts().PreserveVec3Type && VTy->getNumElements() == 3) { - - llvm::VectorType *vec4Ty = - llvm::FixedVectorType::get(VTy->getElementType(), 4); - Address Cast = Addr.withElementType(vec4Ty); - // Now load value. - llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVec4"); - - // Shuffle vector to get vec3. - V = Builder.CreateShuffleVector(V, ArrayRef<int>{0, 1, 2}, "extractVec"); + // Handles vectors of sizes that are likely to be expanded to a larger size + // to optimize performance. + auto *VTy = cast<llvm::FixedVectorType>(Addr.getElementType()); + auto *NewVecTy = + CGM.getABIInfo().getOptimalVectorMemoryType(VTy, getLangOpts()); + + if (VTy != NewVecTy) { + Address Cast = Addr.withElementType(NewVecTy); + llvm::Value *V = Builder.CreateLoad(Cast, Volatile, "loadVecN"); + unsigned OldNumElements = VTy->getNumElements(); + SmallVector<int, 4> Mask(OldNumElements); + std::iota(Mask.begin(), Mask.end(), 0); + V = Builder.CreateShuffleVector(V, Mask, "extractVec"); return EmitFromMemory(V, Ty); } } @@ -2145,21 +2144,21 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, Addr = Addr.withPointer(Builder.CreateThreadLocalAddress(GV), NotKnownNonNull); + // Handles vectors of sizes that are likely to be expanded to a larger size + // to optimize performance. llvm::Type *SrcTy = Value->getType(); if (const auto *ClangVecTy = Ty->getAs<VectorType>()) { - auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy); - if (!CGM.getCodeGenOpts().PreserveVec3Type) { - // Handle vec3 special. - if (VecTy && !ClangVecTy->isExtVectorBoolType() && - cast<llvm::FixedVectorType>(VecTy)->getNumElements() == 3) { - // Our source is a vec3, do a shuffle vector to make it a vec4. 
- Value = Builder.CreateShuffleVector(Value, ArrayRef<int>{0, 1, 2, -1}, - "extractVec"); - SrcTy = llvm::FixedVectorType::get(VecTy->getElementType(), 4); + if (auto *VecTy = dyn_cast<llvm::FixedVectorType>(SrcTy)) { + auto *NewVecTy = + CGM.getABIInfo().getOptimalVectorMemoryType(VecTy, getLangOpts()); + if (!ClangVecTy->isExtVectorBoolType() && VecTy != NewVecTy) { + SmallVector<int, 4> Mask(NewVecTy->getNumElements(), -1); + std::iota(Mask.begin(), Mask.begin() + VecTy->getNumElements(), 0); + Value = Builder.CreateShuffleVector(Value, Mask, "extractVec"); + SrcTy = NewVecTy; } - if (Addr.getElementType() != SrcTy) { + if (Addr.getElementType() != SrcTy) Addr = Addr.withElementType(SrcTy); - } } } diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index fa07e68c558356..788eac5f28231e 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -52,6 +52,17 @@ class AMDGPUABIInfo final : public DefaultABIInfo { void computeInfo(CGFunctionInfo &FI) const override; RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const override; + + llvm::FixedVectorType * + getOptimalVectorMemoryType(llvm::FixedVectorType *T, + const LangOptions &Opt) const override { + // We have legal instructions for 96-bit so 3x32 can be supported. + // FIXME: This check should be a subtarget feature as technically SI doesn't + // support it. 
+ if (T->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(T) == 96) + return T; + return DefaultABIInfo::getOptimalVectorMemoryType(T, Opt); + } }; bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { diff --git a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp index ffbce9ff8d6f4c..7dc3b6bd598221 100644 --- a/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp +++ b/clang/test/CodeGenCXX/matrix-vector-bit-int.cpp @@ -15,14 +15,14 @@ using i512x3x3 = _BitInt(512) __attribute__((matrix_type(3, 3))); // CHECK-NEXT: [[A:%.*]] = alloca <3 x i8>, align 4 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i8>, align 4 // CHECK-NEXT: store i32 [[A_COERCE]], ptr [[A]], align 4 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i8>, ptr [[A]], align 4 -// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVEC4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i8>, ptr [[A]], align 4 +// CHECK-NEXT: [[A1:%.*]] = shufflevector <4 x i8> [[LOADVECN]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> [[A1]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVEC42]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC44:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 -// CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVEC44]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i8> [[LOADVECN2]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN4:%.*]] = load <4 x i8>, ptr [[A_ADDR]], align 4 +// 
CHECK-NEXT: [[EXTRACTVEC5:%.*]] = shufflevector <4 x i8> [[LOADVECN4]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i8> [[EXTRACTVEC3]], [[EXTRACTVEC5]] // CHECK-NEXT: store <3 x i8> [[ADD]], ptr [[RETVAL]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4 @@ -38,10 +38,10 @@ i8x3 v1(i8x3 a) { // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i32>, align 16 // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[A]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i32> [[EXTRACTVEC]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVEC4]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC42:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVEC42]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x i32> [[LOADVECN]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN2:%.*]] = load <4 x i32>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[EXTRACTVEC3:%.*]] = shufflevector <4 x i32> [[LOADVECN2]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i32> [[EXTRACTVEC1]], [[EXTRACTVEC3]] // CHECK-NEXT: ret <3 x i32> [[ADD]] // @@ -53,14 +53,14 @@ i32x3 v2(i32x3 a) { // CHECK-SAME: ptr noundef byval(<3 x i512>) align 256 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x i512>, align 256 -// CHECK-NEXT: [[LOADVEC4:%.*]] = load <4 x i512>, ptr [[TMP0]], align 256 -// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVEC4]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN:%.*]] = 
load <4 x i512>, ptr [[TMP0]], align 256 +// CHECK-NEXT: [[A:%.*]] = shufflevector <4 x i512> [[LOADVECN]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i512> [[A]], <3 x i512> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x i512> [[EXTRACTVEC]], ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[LOADVEC41:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVEC41]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: [[LOADVEC43:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 -// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVEC43]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN1:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 +// CHECK-NEXT: [[EXTRACTVEC2:%.*]] = shufflevector <4 x i512> [[LOADVECN1]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> +// CHECK-NEXT: [[LOADVECN3:%.*]] = load <4 x i512>, ptr [[A_ADDR]], align 256 +// CHECK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x i512> [[LOADVECN3]], <4 x i512> poison, <3 x i32> <i32 0, i32 1, i32 2> // CHECK-NEXT: [[ADD:%.*]] = add <3 x i512> [[EXTRACTVEC2]], [[EXTRACTVEC4]] // CHECK-NEXT: ret <3 x i512> [[ADD]] // diff --git a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl index 8f57713fe1f041..3c2653bf34124f 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-alignment.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-alignment.cl @@ -106,7 +106,7 @@ typedef double __attribute__((ext_vector_type(16))) double16; // CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i16, align 32 // CHECK: store volatile i32 0, ptr addrspace(3) @local_memory_alignment_global.lds_i32, align 4 // CHECK: store volatile <2 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2i32, align 8 -// CHECK: 
store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16 +// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3i32, align 16 // CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4i32, align 16 // CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8i32, align 32 // CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16i32, align 64 @@ -124,7 +124,7 @@ typedef double __attribute__((ext_vector_type(16))) double16; // CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f16, align 32 // CHECK: store volatile float 0.000000e+00, ptr addrspace(3) @local_memory_alignment_global.lds_f32, align 4 // CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v2f32, align 8 -// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16 +// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v3f32, align 16 // CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v4f32, align 16 // CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v8f32, align 32 // CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(3) @local_memory_alignment_global.lds_v16f32, align 64 @@ -393,7 +393,7 @@ kernel void local_memory_alignment_arg( // CHECK: store volatile <16 x i16> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile i32 0, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4 // CHECK: store volatile <2 x i32> 
zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8 -// CHECK: store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 +// CHECK: store volatile <3 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <8 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile <16 x i32> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64 @@ -411,7 +411,7 @@ kernel void local_memory_alignment_arg( // CHECK: store volatile <16 x half> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile float 0.000000e+00, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 4 // CHECK: store volatile <2 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 8 -// CHECK: store volatile <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 +// CHECK: store volatile <3 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <4 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 16 // CHECK: store volatile <8 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 32 // CHECK: store volatile <16 x float> zeroinitializer, ptr addrspace(5) %arraydecay{{[0-9]+}}, align 64 diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl index c84effe0c4b6e3..747cc301feff6c 100644 --- a/clang/test/CodeGenOpenCL/preserve_vec3.cl +++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -fpreserve-vec3-type | FileCheck %s +// RUN: 
%clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s typedef char char3 __attribute__((ext_vector_type(3))); typedef char char8 __attribute__((ext_vector_type(8))); @@ -9,10 +9,11 @@ typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); // CHECK-LABEL: define dso_local spir_kernel void @foo( -// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store <3 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: ret void // void kernel foo(global float3 *a, global float3 *b) { @@ -20,11 +21,11 @@ void kernel foo(global float3 *a, global float3 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3( -// CHECK-SAME: ptr 
addrspace(1) nocapture noundef writeonly align 16 initializes((0, 12)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x float> [[ASTYPE]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float4_to_float3(global float3 *a, global float4 *b) { @@ -34,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4( // CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr 
addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> // CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // @@ -46,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) { // CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2( // CHECK-SAME: ptr addrspace(1) nocapture noundef readonly align 16 [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly align 16 initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]] -// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] +// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel float3_to_double2(global float3 *a, global double2 *b) { @@ -56,11 +57,11 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) { } // CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3( -// CHECK-SAME: ptr 
addrspace(1) nocapture noundef writeonly align 8 initializes((0, 6)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { +// CHECK-SAME: ptr addrspace(1) nocapture noundef writeonly align 8 initializes((0, 8)) [[A:%.*]], ptr addrspace(1) nocapture noundef readonly align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void kernel char8_to_short3(global short3 *a, global char8 *b) { @@ -70,8 +71,8 @@ void kernel char8_to_short3(global short3 *a, global char8 *b) { // CHECK-LABEL: define dso_local spir_func void @from_char3( // CHECK-SAME: <3 x i8> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i8> [[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i8> 
[[A]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA17:![0-9]+]] // CHECK-NEXT: ret void // void from_char3(char3 a, global int *out) { @@ -81,8 +82,8 @@ void from_char3(char3 a, global int *out) { // CHECK-LABEL: define dso_local spir_func void @from_short3( // CHECK-SAME: <3 x i16> noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> -// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[A]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] // CHECK-NEXT: ret void // void from_short3(short3 a, global long *out) { @@ -90,11 +91,11 @@ void from_short3(short3 a, global long *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_char3( -// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 3)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i32 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 4)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[A]] to <4 x i8> -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i8> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i8> [[EXTRACTVEC]], 
ptr addrspace(1) [[OUT]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void scalar_to_char3(int a, global char3 *out) { @@ -102,11 +103,11 @@ void scalar_to_char3(int a, global char3 *out) { } // CHECK-LABEL: define dso_local spir_func void @scalar_to_short3( -// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 6)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-SAME: i64 noundef [[A:%.*]], ptr addrspace(1) nocapture noundef writeonly initializes((0, 8)) [[OUT:%.*]]) local_unnamed_addr #[[ATTR1]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> -// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> -// CHECK-NEXT: store <3 x i16> [[ASTYPE]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] +// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> +// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[OUT]], align 8, !tbaa [[TBAA8]] // CHECK-NEXT: ret void // void scalar_to_short3(long a, global short3 *out) { _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits